From 39d395f75c306a0d932a783eef039fd93d66e246 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Wed, 7 Aug 2024 12:10:43 -0500 Subject: [PATCH 01/58] LLDB Support for AIX --- clang/lib/CodeGen/CGObjCMac.cpp | 6 +- lldb/CMakeLists.txt | 4 + lldb/cmake/modules/LLDBConfig.cmake | 2 +- lldb/include/lldb/Core/Module.h | 3 + lldb/include/lldb/Core/ModuleSpec.h | 23 +- lldb/include/lldb/Host/HostGetOpt.h | 2 +- lldb/include/lldb/Host/HostInfo.h | 3 + lldb/include/lldb/Host/HostInfoBase.h | 2 +- lldb/include/lldb/Host/XML.h | 5 + lldb/include/lldb/Host/aix/AbstractSocket.h | 25 + lldb/include/lldb/Host/aix/Host.h | 22 + lldb/include/lldb/Host/aix/HostInfoAIX.h | 42 + lldb/include/lldb/Host/aix/Ptrace.h | 62 + lldb/include/lldb/Host/aix/Support.h | 29 + lldb/include/lldb/Host/aix/Uio.h | 23 + lldb/include/lldb/Host/common/GetOptInc.h | 6 +- lldb/include/lldb/Symbol/ObjectFile.h | 5 + lldb/include/lldb/Target/ABI.h | 6 + lldb/include/lldb/Target/DynamicLoader.h | 6 + lldb/include/lldb/Target/Process.h | 14 + .../lldb/Target/RegisterContextUnwind.h | 4 + .../lldb/Target/ThreadPlanCallFunction.h | 6 + .../lldb/Utility/StringExtractorGDBRemote.h | 1 + lldb/include/lldb/lldb-private-enumerations.h | 1 + lldb/source/API/CMakeLists.txt | 108 + lldb/source/API/SBBreakpoint.cpp | 6 +- lldb/source/API/SBBreakpointLocation.cpp | 6 +- lldb/source/API/SBBreakpointName.cpp | 4 +- lldb/source/Core/DynamicLoader.cpp | 10 + lldb/source/Core/Mangled.cpp | 2 + lldb/source/Core/Module.cpp | 12 + lldb/source/Core/Section.cpp | 4 + lldb/source/Expression/DWARFExpression.cpp | 10 +- lldb/source/Host/CMakeLists.txt | 13 + lldb/source/Host/aix/AbstractSocket.cpp | 21 + lldb/source/Host/aix/Host.cpp | 304 +++ lldb/source/Host/aix/HostInfoAIX.cpp | 215 ++ lldb/source/Host/aix/Support.cpp | 44 + lldb/source/Host/common/GetOptInc.cpp | 2 +- lldb/source/Host/common/Host.cpp | 180 +- .../source/Host/common/LICENSE.aix-netbsd.txt | 125 + lldb/source/Host/common/XML.cpp | 3 + 
.../posix/ConnectionFileDescriptorPosix.cpp | 2 + lldb/source/Host/posix/FileSystemPosix.cpp | 2 + lldb/source/Host/posix/MainLoopPosix.cpp | 17 + .../Host/posix/ProcessLauncherPosixFork.cpp | 5 + lldb/source/Initialization/CMakeLists.txt | 2 +- .../SystemInitializerCommon.cpp | 4 +- .../Plugins/ABI/PowerPC/ABISysV_ppc64.cpp | 131 +- .../Plugins/ABI/PowerPC/ABISysV_ppc64.h | 6 + .../DynamicLoader/AIX-DYLD/CMakeLists.txt | 11 + .../AIX-DYLD/DynamicLoaderAIXDYLD.cpp | 272 +++ .../AIX-DYLD/DynamicLoaderAIXDYLD.h | 55 + .../Plugins/DynamicLoader/CMakeLists.txt | 1 + .../DynamicLoaderDarwinKernel.cpp | 4 +- .../MacOSX-DYLD/DynamicLoaderDarwin.cpp | 2 +- .../PPC64/EmulateInstructionPPC64.cpp | 196 +- .../PPC64/EmulateInstructionPPC64.h | 14 + ...nstrumentationRuntimeMainThreadChecker.cpp | 2 +- .../TSan/InstrumentationRuntimeTSan.cpp | 14 +- .../UBSan/InstrumentationRuntimeUBSan.cpp | 2 +- .../Plugins/JITLoader/GDB/JITLoaderGDB.cpp | 4 + lldb/source/Plugins/Language/ObjC/Cocoa.cpp | 2 + .../MemoryHistory/asan/MemoryHistoryASan.cpp | 2 +- .../BSD-Archive/ObjectContainerBSDArchive.cpp | 2 +- .../Big-Archive/CMakeLists.txt | 10 + .../Big-Archive/ObjectContainerBigArchive.cpp | 522 +++++ .../Big-Archive/ObjectContainerBigArchive.h | 177 ++ .../Plugins/ObjectContainer/CMakeLists.txt | 1 + lldb/source/Plugins/ObjectFile/CMakeLists.txt | 1 + .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 6 +- .../Minidump/ObjectFileMinidump.cpp | 2 + .../Plugins/ObjectFile/PDB/ObjectFilePDB.cpp | 15 +- .../ObjectFile/PECOFF/ObjectFilePECOFF.cpp | 18 +- .../Plugins/ObjectFile/XCOFF/CMakeLists.txt | 13 + .../ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 780 +++++++ .../ObjectFile/XCOFF/ObjectFileXCOFF.h | 243 ++ .../Python/OperatingSystemPython.cpp | 2 +- .../Plugins/Platform/AIX/CMakeLists.txt | 13 + .../Plugins/Platform/AIX/PlatformAIX.cpp | 471 ++++ .../source/Plugins/Platform/AIX/PlatformAIX.h | 74 + lldb/source/Plugins/Platform/CMakeLists.txt | 1 + .../source/Plugins/Process/AIX/CMakeLists.txt | 19 + 
.../Plugins/Process/AIX/NativeProcessAIX.cpp | 2048 +++++++++++++++++ .../Plugins/Process/AIX/NativeProcessAIX.h | 283 +++ .../Process/AIX/NativeRegisterContextAIX.cpp | 157 ++ .../Process/AIX/NativeRegisterContextAIX.h | 133 ++ .../AIX/NativeRegisterContextAIX_ppc64.cpp | 744 ++++++ .../AIX/NativeRegisterContextAIX_ppc64.h | 138 ++ .../Plugins/Process/AIX/NativeThreadAIX.cpp | 526 +++++ .../Plugins/Process/AIX/NativeThreadAIX.h | 126 + lldb/source/Plugins/Process/CMakeLists.txt | 3 + .../Process/Utility/InferiorCallPOSIX.cpp | 33 + .../Utility/RegisterInfoPOSIX_ppc64le.cpp | 4 + .../Plugins/Process/Utility/ThreadMemory.cpp | 2 +- .../Plugins/Process/gdb-remote/CMakeLists.txt | 5 + .../GDBRemoteCommunicationClient.cpp | 30 + .../gdb-remote/GDBRemoteCommunicationClient.h | 7 + .../GDBRemoteCommunicationServerLLGS.cpp | 28 + .../GDBRemoteCommunicationServerLLGS.h | 2 + .../Process/gdb-remote/ProcessGDBRemote.cpp | 13 +- .../Process/gdb-remote/ProcessGDBRemote.h | 8 + .../Process/mach-core/ProcessMachCore.cpp | 8 +- .../ScriptInterpreter/Python/CMakeLists.txt | 5 + .../SymbolFile/DWARF/DWARFFormValue.cpp | 4 + .../Plugins/SymbolFile/DWARF/DWARFUnit.cpp | 12 +- .../MacOSX/AppleGetThreadItemInfoHandler.cpp | 2 +- lldb/source/Symbol/DWARFCallFrameInfo.cpp | 4 +- lldb/source/Target/ABI.cpp | 9 + lldb/source/Target/CMakeLists.txt | 5 + lldb/source/Target/Process.cpp | 10 + lldb/source/Target/RegisterContextUnwind.cpp | 46 + lldb/source/Target/ThreadPlanCallFunction.cpp | 34 + lldb/source/Target/UnwindLLDB.cpp | 15 + lldb/source/Utility/ArchSpec.cpp | 18 +- .../Utility/StringExtractorGDBRemote.cpp | 2 + lldb/test/CMakeLists.txt | 2 +- lldb/test/Shell/Expr/TestIRMemoryMap.test | 2 +- lldb/test/Shell/Process/TestEnvironment.test | 2 +- lldb/tools/driver/CMakeLists.txt | 5 + lldb/tools/driver/Driver.cpp | 5 +- lldb/tools/lldb-dap/CMakeLists.txt | 4 + lldb/tools/lldb-server/CMakeLists.txt | 7 + .../lldb-server/SystemInitializerLLGS.cpp | 15 + 
lldb/tools/lldb-server/lldb-gdbserver.cpp | 4 + lldb/unittests/Host/FileSystemTest.cpp | 2 +- lldb/unittests/Host/posix/TerminalTest.cpp | 4 + llvm/include/llvm/Object/XCOFFObjectFile.h | 4 +- llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp | 15 +- 129 files changed, 8950 insertions(+), 76 deletions(-) create mode 100644 lldb/include/lldb/Host/aix/AbstractSocket.h create mode 100644 lldb/include/lldb/Host/aix/Host.h create mode 100644 lldb/include/lldb/Host/aix/HostInfoAIX.h create mode 100644 lldb/include/lldb/Host/aix/Ptrace.h create mode 100644 lldb/include/lldb/Host/aix/Support.h create mode 100644 lldb/include/lldb/Host/aix/Uio.h create mode 100644 lldb/source/Host/aix/AbstractSocket.cpp create mode 100644 lldb/source/Host/aix/Host.cpp create mode 100644 lldb/source/Host/aix/HostInfoAIX.cpp create mode 100644 lldb/source/Host/aix/Support.cpp create mode 100644 lldb/source/Host/common/LICENSE.aix-netbsd.txt create mode 100644 lldb/source/Plugins/DynamicLoader/AIX-DYLD/CMakeLists.txt create mode 100644 lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp create mode 100644 lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h create mode 100644 lldb/source/Plugins/ObjectContainer/Big-Archive/CMakeLists.txt create mode 100644 lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.cpp create mode 100644 lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.h create mode 100644 lldb/source/Plugins/ObjectFile/XCOFF/CMakeLists.txt create mode 100644 lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp create mode 100644 lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h create mode 100644 lldb/source/Plugins/Platform/AIX/CMakeLists.txt create mode 100644 lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp create mode 100644 lldb/source/Plugins/Platform/AIX/PlatformAIX.h create mode 100644 lldb/source/Plugins/Process/AIX/CMakeLists.txt create mode 100644 lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp 
create mode 100644 lldb/source/Plugins/Process/AIX/NativeProcessAIX.h create mode 100644 lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp create mode 100644 lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.h create mode 100644 lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.cpp create mode 100644 lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.h create mode 100644 lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp create mode 100644 lldb/source/Plugins/Process/AIX/NativeThreadAIX.h diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 30f3911a8b03c..fc91981db68c1 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -5052,10 +5052,14 @@ std::string CGObjCCommonMac::GetSectionName(StringRef Section, case llvm::Triple::COFF: assert(Section.starts_with("__") && "expected the name to begin with __"); return ("." + Section.substr(2) + "$B").str(); + case llvm::Triple::XCOFF: + // Hack to allow "p 10+1" on AIX for lldb + assert(Section.substr(0, 2) == "__" && + "expected the name to begin with __"); + return Section.substr(2).str(); case llvm::Triple::Wasm: case llvm::Triple::GOFF: case llvm::Triple::SPIRV: - case llvm::Triple::XCOFF: case llvm::Triple::DXContainer: llvm::report_fatal_error( "Objective-C support is unimplemented for object file format"); diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index 59cdc4593463c..2e9ae0d0b3221 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -38,6 +38,10 @@ endif() include(LLDBConfig) include(AddLLDB) +if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + add_definitions("-D__AIX__") +endif() + # Define the LLDB_CONFIGURATION_xxx matching the build type. 
if(uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) add_definitions(-DLLDB_CONFIGURATION_DEBUG) diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake index a60921990cf77..a0f118a11984c 100644 --- a/lldb/cmake/modules/LLDBConfig.cmake +++ b/lldb/cmake/modules/LLDBConfig.cmake @@ -299,7 +299,7 @@ endif() # Figure out if lldb could use lldb-server. If so, then we'll # ensure we build lldb-server when an lldb target is being built. -if (CMAKE_SYSTEM_NAME MATCHES "Android|Darwin|FreeBSD|Linux|NetBSD|Windows") +if (CMAKE_SYSTEM_NAME MATCHES "Android|Darwin|FreeBSD|Linux|NetBSD|Windows|AIX") set(LLDB_CAN_USE_LLDB_SERVER ON) else() set(LLDB_CAN_USE_LLDB_SERVER OFF) diff --git a/lldb/include/lldb/Core/Module.h b/lldb/include/lldb/Core/Module.h index 5589c1c9a350d..3829386562795 100644 --- a/lldb/include/lldb/Core/Module.h +++ b/lldb/include/lldb/Core/Module.h @@ -196,6 +196,9 @@ class Module : public std::enable_shared_from_this, bool SetLoadAddress(Target &target, lldb::addr_t value, bool value_is_offset, bool &changed); + bool SetLoadAddressByType(Target &target, lldb::addr_t value, + bool value_is_offset, bool &changed, int type_id); + /// \copydoc SymbolContextScope::CalculateSymbolContext(SymbolContext*) /// /// \see SymbolContextScope diff --git a/lldb/include/lldb/Core/ModuleSpec.h b/lldb/include/lldb/Core/ModuleSpec.h index 4cbbbfa8a26e1..4fe06412b6b0b 100644 --- a/lldb/include/lldb/Core/ModuleSpec.h +++ b/lldb/include/lldb/Core/ModuleSpec.h @@ -21,6 +21,7 @@ #include #include +#include namespace lldb_private { @@ -41,8 +42,26 @@ class ModuleSpec { } ModuleSpec(const FileSpec &file_spec, const ArchSpec &arch) - : m_file(file_spec), m_arch(arch), m_object_offset(0), - m_object_size(FileSystem::Instance().GetByteSize(file_spec)) {} + : m_arch(arch), m_object_offset(0) { + // parse object inside module format for example: /usr/ccs/lib/libc.a(shr_64.o) + llvm::SmallString<256> path_with_object; + file_spec.GetPath(path_with_object); + if 
(strstr(path_with_object.c_str(), "(") != nullptr) { + char *part; + char *str = (char *)path_with_object.c_str(); + part = strtok(str, "()"); + assert(part); + llvm::StringRef file_name(part); + part = strtok(nullptr, "()"); + assert(part); + m_object_name = ConstString(part); + m_file = FileSpec(file_name); + m_object_size = FileSystem::Instance().GetByteSize(m_file); + } else { + m_file = file_spec; + m_object_size = FileSystem::Instance().GetByteSize(file_spec); + } + } FileSpec *GetFileSpecPtr() { return (m_file ? &m_file : nullptr); } diff --git a/lldb/include/lldb/Host/HostGetOpt.h b/lldb/include/lldb/Host/HostGetOpt.h index 52cfdf4dbb89c..f450e561d6afb 100644 --- a/lldb/include/lldb/Host/HostGetOpt.h +++ b/lldb/include/lldb/Host/HostGetOpt.h @@ -9,7 +9,7 @@ #ifndef LLDB_HOST_HOSTGETOPT_H #define LLDB_HOST_HOSTGETOPT_H -#if !defined(_MSC_VER) && !defined(__NetBSD__) +#if !defined(_MSC_VER) && !defined(__NetBSD__) && !defined(__AIX__) #include #include diff --git a/lldb/include/lldb/Host/HostInfo.h b/lldb/include/lldb/Host/HostInfo.h index b7010d69d88e7..156df8cf6901d 100644 --- a/lldb/include/lldb/Host/HostInfo.h +++ b/lldb/include/lldb/Host/HostInfo.h @@ -55,6 +55,9 @@ #elif defined(__APPLE__) #include "lldb/Host/macosx/HostInfoMacOSX.h" #define HOST_INFO_TYPE HostInfoMacOSX +#elif defined(__AIX__) +#include "lldb/Host/aix/HostInfoAIX.h" +#define HOST_INFO_TYPE HostInfoAIX #else #include "lldb/Host/posix/HostInfoPosix.h" #define HOST_INFO_TYPE HostInfoPosix diff --git a/lldb/include/lldb/Host/HostInfoBase.h b/lldb/include/lldb/Host/HostInfoBase.h index 705aad559f3b7..29e6acf39bfb2 100644 --- a/lldb/include/lldb/Host/HostInfoBase.h +++ b/lldb/include/lldb/Host/HostInfoBase.h @@ -149,6 +149,7 @@ class HostInfoBase { return {}; } + static bool ComputeSharedLibraryDirectory(FileSpec &file_spec); /// Returns the distribution id of the host /// /// This will be something like "ubuntu", "fedora", etc. on Linux. 
@@ -158,7 +159,6 @@ class HostInfoBase { static llvm::StringRef GetDistributionId() { return llvm::StringRef(); } protected: - static bool ComputeSharedLibraryDirectory(FileSpec &file_spec); static bool ComputeSupportExeDirectory(FileSpec &file_spec); static bool ComputeProcessTempFileDirectory(FileSpec &file_spec); static bool ComputeGlobalTempFileDirectory(FileSpec &file_spec); diff --git a/lldb/include/lldb/Host/XML.h b/lldb/include/lldb/Host/XML.h index da0f9cd7aa8c0..cf359f7726d5d 100644 --- a/lldb/include/lldb/Host/XML.h +++ b/lldb/include/lldb/Host/XML.h @@ -11,6 +11,11 @@ #include "lldb/Host/Config.h" +#if defined(__AIX__) +//FIXME for AIX +#undef LLDB_ENABLE_LIBXML2 +#endif + #if LLDB_ENABLE_LIBXML2 #include #endif diff --git a/lldb/include/lldb/Host/aix/AbstractSocket.h b/lldb/include/lldb/Host/aix/AbstractSocket.h new file mode 100644 index 0000000000000..78a567a6b9095 --- /dev/null +++ b/lldb/include/lldb/Host/aix/AbstractSocket.h @@ -0,0 +1,25 @@ +//===-- AbstractSocket.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef liblldb_AbstractSocket_h_ +#define liblldb_AbstractSocket_h_ + +#include "lldb/Host/posix/DomainSocket.h" + +namespace lldb_private { +class AbstractSocket : public DomainSocket { +public: + AbstractSocket(bool child_processes_inherit); + +protected: + size_t GetNameOffset() const override; + void DeleteSocketFile(llvm::StringRef name) override; +}; +} + +#endif // ifndef liblldb_AbstractSocket_h_ diff --git a/lldb/include/lldb/Host/aix/Host.h b/lldb/include/lldb/Host/aix/Host.h new file mode 100644 index 0000000000000..1e3487752995f --- /dev/null +++ b/lldb/include/lldb/Host/aix/Host.h @@ -0,0 +1,22 @@ +//===-- Host.h --------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_HOST_AIX_HOST_H +#define LLDB_HOST_AIX_HOST_H + +#include "lldb/lldb-types.h" +#include + +namespace lldb_private { + +// Get PID (i.e. the primary thread ID) corresponding to the specified TID. +std::optional getPIDForTID(lldb::pid_t tid); + +} // namespace lldb_private + +#endif // #ifndef LLDB_HOST_AIX_HOST_H diff --git a/lldb/include/lldb/Host/aix/HostInfoAIX.h b/lldb/include/lldb/Host/aix/HostInfoAIX.h new file mode 100644 index 0000000000000..ced4cf34d38a8 --- /dev/null +++ b/lldb/include/lldb/Host/aix/HostInfoAIX.h @@ -0,0 +1,42 @@ +//===-- HostInfoAIX.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef lldb_Host_aix_HostInfoAIX_h_ +#define lldb_Host_aix_HostInfoAIX_h_ + +#include "lldb/Host/posix/HostInfoPosix.h" +#include "lldb/Utility/FileSpec.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/VersionTuple.h" + +#include + +namespace lldb_private { + +class HostInfoAIX : public HostInfoPosix { + friend class HostInfoBase; + +public: + static void Initialize(SharedLibraryDirectoryHelper *helper = nullptr); + static void Terminate(); + + static llvm::VersionTuple GetOSVersion(); + static std::optional GetOSBuildString(); + static llvm::StringRef GetDistributionId(); + static FileSpec GetProgramFileSpec(); + +protected: + static bool ComputeSupportExeDirectory(FileSpec &file_spec); + static bool ComputeSystemPluginsDirectory(FileSpec &file_spec); + static bool ComputeUserPluginsDirectory(FileSpec &file_spec); + static void ComputeHostArchitectureSupport(ArchSpec &arch_32, + ArchSpec &arch_64); +}; +} + +#endif diff --git a/lldb/include/lldb/Host/aix/Ptrace.h b/lldb/include/lldb/Host/aix/Ptrace.h new file mode 100644 index 0000000000000..88928f18102d7 --- /dev/null +++ b/lldb/include/lldb/Host/aix/Ptrace.h @@ -0,0 +1,62 @@ +//===-- Ptrace.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This file defines ptrace functions & structures + +#ifndef liblldb_Host_aix_Ptrace_h_ +#define liblldb_Host_aix_Ptrace_h_ + +#include + +#define DEBUG_PTRACE_MAXBYTES 20 + +// Support ptrace extensions even when compiled without required kernel support +#ifndef PTRACE_GETREGS +#define PTRACE_GETREGS (PT_COMMAND_MAX+1) +#endif +#ifndef PTRACE_SETREGS +#define PTRACE_SETREGS (PT_COMMAND_MAX+2) +#endif +#ifndef PTRACE_GETFPREGS +#define PTRACE_GETFPREGS (PT_COMMAND_MAX+3) +#endif +#ifndef PTRACE_SETFPREGS +#define PTRACE_SETFPREGS (PT_COMMAND_MAX+4) +#endif +#ifndef PTRACE_GETREGSET +#define PTRACE_GETREGSET 0x4204 +#endif +#ifndef PTRACE_SETREGSET +#define PTRACE_SETREGSET 0x4205 +#endif +#ifndef PTRACE_GET_THREAD_AREA +#define PTRACE_GET_THREAD_AREA (PT_COMMAND_MAX+5) +#endif +#ifndef PTRACE_ARCH_PRCTL +#define PTRACE_ARCH_PRCTL (PT_COMMAND_MAX+6) +#endif +#ifndef ARCH_GET_FS +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 +#endif +#ifndef PTRACE_PEEKMTETAGS +#define PTRACE_PEEKMTETAGS (PT_COMMAND_MAX+7) +#endif +#ifndef PTRACE_POKEMTETAGS +#define PTRACE_POKEMTETAGS (PT_COMMAND_MAX+8) +#endif +#ifndef PTRACE_GETVRREGS +#define PTRACE_GETVRREGS (PT_COMMAND_MAX+9) +#endif +#ifndef PTRACE_GETVSRREGS +#define PTRACE_GETVSRREGS (PT_COMMAND_MAX+10) +#endif + +#endif // liblldb_Host_aix_Ptrace_h_ diff --git a/lldb/include/lldb/Host/aix/Support.h b/lldb/include/lldb/Host/aix/Support.h new file mode 100644 index 0000000000000..27d6c2b50a35b --- /dev/null +++ b/lldb/include/lldb/Host/aix/Support.h @@ -0,0 +1,29 @@ +//===-- Support.h -----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_HOST_AIX_SUPPORT_H +#define LLDB_HOST_AIX_SUPPORT_H + +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include + +namespace lldb_private { + +llvm::ErrorOr> +getProcFile(::pid_t pid, ::pid_t tid, const llvm::Twine &file); + +llvm::ErrorOr> +getProcFile(::pid_t pid, const llvm::Twine &file); + +llvm::ErrorOr> +getProcFile(const llvm::Twine &file); + +} // namespace lldb_private + +#endif // #ifndef LLDB_HOST_AIX_SUPPORT_H diff --git a/lldb/include/lldb/Host/aix/Uio.h b/lldb/include/lldb/Host/aix/Uio.h new file mode 100644 index 0000000000000..acf79ecc6a1d0 --- /dev/null +++ b/lldb/include/lldb/Host/aix/Uio.h @@ -0,0 +1,23 @@ +//===-- Uio.h ---------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef liblldb_Host_aix_Uio_h_ +#define liblldb_Host_aix_Uio_h_ + +#include "lldb/Host/Config.h" +#include + +// We shall provide our own implementation of process_vm_readv if it is not +// present +#if !HAVE_PROCESS_VM_READV +ssize_t process_vm_readv(::pid_t pid, const struct iovec *local_iov, + unsigned long liovcnt, const struct iovec *remote_iov, + unsigned long riovcnt, unsigned long flags); +#endif + +#endif // liblldb_Host_aix_Uio_h_ diff --git a/lldb/include/lldb/Host/common/GetOptInc.h b/lldb/include/lldb/Host/common/GetOptInc.h index 3fb9add479541..ebb475bfaf6b8 100644 --- a/lldb/include/lldb/Host/common/GetOptInc.h +++ b/lldb/include/lldb/Host/common/GetOptInc.h @@ -11,11 +11,11 @@ #include "lldb/lldb-defines.h" -#if defined(_MSC_VER) +#if defined(_MSC_VER) || defined(__AIX__) #define REPLACE_GETOPT #define REPLACE_GETOPT_LONG #endif -#if defined(_MSC_VER) || defined(__NetBSD__) +#if defined(_MSC_VER) || defined(__NetBSD__) || defined(__AIX__) #define REPLACE_GETOPT_LONG_ONLY #endif @@ -35,7 +35,7 @@ struct option { int val; }; -int getopt(int argc, char *const argv[], const char *optstring); +int getopt(int argc, char *const argv[], const char *optstring) throw(); // from getopt.h extern char *optarg; diff --git a/lldb/include/lldb/Symbol/ObjectFile.h b/lldb/include/lldb/Symbol/ObjectFile.h index 8592323322e38..bf66ccec263d2 100644 --- a/lldb/include/lldb/Symbol/ObjectFile.h +++ b/lldb/include/lldb/Symbol/ObjectFile.h @@ -401,6 +401,11 @@ class ObjectFile : public std::enable_shared_from_this, return false; } + virtual bool SetLoadAddressByType(Target &target, lldb::addr_t value, + bool value_is_offset, int type_id) { + return false; + } + /// Gets whether endian swapping should occur when extracting data from this /// object file. 
/// diff --git a/lldb/include/lldb/Target/ABI.h b/lldb/include/lldb/Target/ABI.h index 7b646d743346b..281a89951ef88 100644 --- a/lldb/include/lldb/Target/ABI.h +++ b/lldb/include/lldb/Target/ABI.h @@ -47,6 +47,12 @@ class ABI : public PluginInterface { lldb::addr_t returnAddress, llvm::ArrayRef args) const = 0; + virtual bool PrepareTrivialCall(lldb_private::Thread &thread, lldb::addr_t sp, + lldb::addr_t functionAddress, + lldb::addr_t tocAddress, + lldb::addr_t returnAddress, + llvm::ArrayRef args) const; + // Prepare trivial call used from ThreadPlanFunctionCallUsingABI // AD: // . Because i don't want to change other ABI's this is not declared pure diff --git a/lldb/include/lldb/Target/DynamicLoader.h b/lldb/include/lldb/Target/DynamicLoader.h index 0629e2faae7e9..7dccd317c2dca 100644 --- a/lldb/include/lldb/Target/DynamicLoader.h +++ b/lldb/include/lldb/Target/DynamicLoader.h @@ -359,6 +359,12 @@ class DynamicLoader : public PluginInterface { lldb::addr_t base_addr, bool base_addr_is_offset); + virtual void UpdateLoadedSectionsByType(lldb::ModuleSP module, + lldb::addr_t link_map_addr, + lldb::addr_t base_addr, + bool base_addr_is_offset, + int type_id); + // Utility method so base classes can share implementation of // UpdateLoadedSections void UpdateLoadedSectionsCommon(lldb::ModuleSP module, lldb::addr_t base_addr, diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index cf16fbc812aa4..886ca766112c8 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -63,6 +63,10 @@ #include "llvm/Support/Threading.h" #include "llvm/Support/VersionTuple.h" +#if defined(__AIX__) +struct ld_xinfo; +#endif + namespace lldb_private { template struct Range; @@ -1915,6 +1919,10 @@ class Process : public std::enable_shared_from_this, Status GetMemoryRegionInfo(lldb::addr_t load_addr, MemoryRegionInfo &range_info); +#if defined(__AIX__) + Status GetLDXINFO(struct ld_xinfo *info_ptr); +#endif + /// Obtain all 
the mapped memory regions within this process. /// /// \param[out] region_list @@ -2855,6 +2863,12 @@ void PruneThreadPlans(); return Status("Process::DoGetMemoryRegionInfo() not supported"); } +#if defined(__AIX__) + virtual Status DoGetLDXINFO(struct ld_xinfo *info_ptr) { + return Status("Process::DoGetLDXINFO() not supported"); + } +#endif + /// Provide an override value in the subclass for lldb's /// CPU-based logic for whether watchpoint exceptions are /// received before or after an instruction executes. diff --git a/lldb/include/lldb/Target/RegisterContextUnwind.h b/lldb/include/lldb/Target/RegisterContextUnwind.h index ef8ae88403866..00a95853800ed 100644 --- a/lldb/include/lldb/Target/RegisterContextUnwind.h +++ b/lldb/include/lldb/Target/RegisterContextUnwind.h @@ -67,6 +67,10 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { bool ReadPC(lldb::addr_t &start_pc); +#ifdef __AIX__ + bool ReadLR(lldb::addr_t &lr); +#endif + // Indicates whether this frame *behaves* like frame zero -- the currently // executing frame -- or not. This can be true in the middle of the stack // above asynchronous trap handlers (sigtramp) for instance. 
diff --git a/lldb/include/lldb/Target/ThreadPlanCallFunction.h b/lldb/include/lldb/Target/ThreadPlanCallFunction.h index cb6e7caebb4ad..7880db1592e04 100644 --- a/lldb/include/lldb/Target/ThreadPlanCallFunction.h +++ b/lldb/include/lldb/Target/ThreadPlanCallFunction.h @@ -27,6 +27,12 @@ class ThreadPlanCallFunction : public ThreadPlan { llvm::ArrayRef args, const EvaluateExpressionOptions &options); + ThreadPlanCallFunction(Thread &thread, const Address &function, + const Address &toc, + const CompilerType &return_type, + llvm::ArrayRef args, + const EvaluateExpressionOptions &options); + ThreadPlanCallFunction(Thread &thread, const Address &function, const EvaluateExpressionOptions &options); diff --git a/lldb/include/lldb/Utility/StringExtractorGDBRemote.h b/lldb/include/lldb/Utility/StringExtractorGDBRemote.h index dd468ef5bddef..9953bd6c24588 100644 --- a/lldb/include/lldb/Utility/StringExtractorGDBRemote.h +++ b/lldb/include/lldb/Utility/StringExtractorGDBRemote.h @@ -61,6 +61,7 @@ class StringExtractorGDBRemote : public StringExtractor { eServerPacketType_qQueryGDBServer, eServerPacketType_qKillSpawnedProcess, eServerPacketType_qLaunchSuccess, + eServerPacketType_qLDXINFO, eServerPacketType_qModuleInfo, eServerPacketType_qProcessInfoPID, eServerPacketType_qSpeedTest, diff --git a/lldb/include/lldb/lldb-private-enumerations.h b/lldb/include/lldb/lldb-private-enumerations.h index c24a3538f58da..98c1e956bf8f7 100644 --- a/lldb/include/lldb/lldb-private-enumerations.h +++ b/lldb/include/lldb/lldb-private-enumerations.h @@ -65,6 +65,7 @@ enum ArchitectureType { eArchTypeMachO, eArchTypeELF, eArchTypeCOFF, + eArchTypeXCOFF, kNumArchTypes }; diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index a32bc58507d8e..3ecdb11daef7d 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -40,6 +40,113 @@ add_custom_target(lldb-sbapi-dwarf-enums DEPENDS ${sb_languages_file}) set_target_properties(lldb-sbapi-dwarf-enums 
PROPERTIES FOLDER "LLDB/Tablegenning") +if(CMAKE_SYSTEM_NAME MATCHES "AIX") +add_lldb_library(liblldb STATIC ${option_framework} + SBAddress.cpp + SBAddressRange.cpp + SBAddressRangeList.cpp + SBAttachInfo.cpp + SBBlock.cpp + SBBreakpoint.cpp + SBBreakpointLocation.cpp + SBBreakpointName.cpp + SBBreakpointOptionCommon.cpp + SBBroadcaster.cpp + SBCommandInterpreter.cpp + SBCommandInterpreterRunOptions.cpp + SBCommandReturnObject.cpp + SBCommunication.cpp + SBCompileUnit.cpp + SBSaveCoreOptions.cpp + SBData.cpp + SBDebugger.cpp + SBDeclaration.cpp + SBEnvironment.cpp + SBError.cpp + SBEvent.cpp + SBExecutionContext.cpp + SBExpressionOptions.cpp + SBFileSpec.cpp + SBFile.cpp + SBFileSpecList.cpp + SBFormat.cpp + SBFrame.cpp + SBFunction.cpp + SBHostOS.cpp + SBInstruction.cpp + SBInstructionList.cpp + SBLanguageRuntime.cpp + SBLaunchInfo.cpp + SBLineEntry.cpp + SBListener.cpp + SBMemoryRegionInfo.cpp + SBMemoryRegionInfoList.cpp + SBModule.cpp + SBModuleSpec.cpp + SBPlatform.cpp + SBProcess.cpp + SBProcessInfo.cpp + SBProcessInfoList.cpp + SBQueue.cpp + SBQueueItem.cpp + SBReproducer.cpp + SBScriptObject.cpp + SBSection.cpp + SBSourceManager.cpp + SBStatisticsOptions.cpp + SBStream.cpp + SBStringList.cpp + SBStructuredData.cpp + SBSymbol.cpp + SBSymbolContext.cpp + SBSymbolContextList.cpp + SBTarget.cpp + SBThread.cpp + SBThreadCollection.cpp + SBThreadPlan.cpp + SBTrace.cpp + SBTraceCursor.cpp + SBType.cpp + SBTypeCategory.cpp + SBTypeEnumMember.cpp + SBTypeFilter.cpp + SBTypeFormat.cpp + SBTypeNameSpecifier.cpp + SBTypeSummary.cpp + SBTypeSynthetic.cpp + SBValue.cpp + SBValueList.cpp + SBVariablesOptions.cpp + SBWatchpoint.cpp + SBWatchpointOptions.cpp + SBUnixSignals.cpp + SystemInitializerFull.cpp + ${lldb_python_wrapper} + ${lldb_lua_wrapper} + + DEPENDS + lldb-sbapi-dwarf-enums + + LINK_LIBS + lldbBreakpoint + lldbCore + lldbDataFormatters + lldbExpression + lldbHost + lldbInitialization + lldbInterpreter + lldbSymbol + lldbTarget + lldbUtility + lldbVersion + 
${LLDB_ALL_PLUGINS} + LINK_COMPONENTS + Support + + ${option_install_prefix} +) + +else() add_lldb_library(liblldb SHARED ${option_framework} SBAddress.cpp SBAddressRange.cpp @@ -144,6 +251,7 @@ add_lldb_library(liblldb SHARED ${option_framework} ${option_install_prefix} ) +endif() # lib/pythonX.Y/dist-packages/lldb/_lldb.so is a symlink to lib/liblldb.so, # which depends on lib/libLLVM*.so (BUILD_SHARED_LIBS) or lib/libLLVM-10git.so diff --git a/lldb/source/API/SBBreakpoint.cpp b/lldb/source/API/SBBreakpoint.cpp index 3d908047f9455..728fe04d14d92 100644 --- a/lldb/source/API/SBBreakpoint.cpp +++ b/lldb/source/API/SBBreakpoint.cpp @@ -342,7 +342,7 @@ uint32_t SBBreakpoint::GetIgnoreCount() const { return count; } -void SBBreakpoint::SetThreadID(tid_t tid) { +void SBBreakpoint::SetThreadID(lldb::tid_t tid) { LLDB_INSTRUMENT_VA(this, tid); BreakpointSP bkpt_sp = GetSP(); @@ -353,10 +353,10 @@ void SBBreakpoint::SetThreadID(tid_t tid) { } } -tid_t SBBreakpoint::GetThreadID() { +lldb::tid_t SBBreakpoint::GetThreadID() { LLDB_INSTRUMENT_VA(this); - tid_t tid = LLDB_INVALID_THREAD_ID; + lldb::tid_t tid = LLDB_INVALID_THREAD_ID; BreakpointSP bkpt_sp = GetSP(); if (bkpt_sp) { std::lock_guard guard( diff --git a/lldb/source/API/SBBreakpointLocation.cpp b/lldb/source/API/SBBreakpointLocation.cpp index 75b66364d4f1a..fad9a4076a54f 100644 --- a/lldb/source/API/SBBreakpointLocation.cpp +++ b/lldb/source/API/SBBreakpointLocation.cpp @@ -302,7 +302,7 @@ bool SBBreakpointLocation::GetCommandLineCommands(SBStringList &commands) { return has_commands; } -void SBBreakpointLocation::SetThreadID(tid_t thread_id) { +void SBBreakpointLocation::SetThreadID(lldb::tid_t thread_id) { LLDB_INSTRUMENT_VA(this, thread_id); BreakpointLocationSP loc_sp = GetSP(); @@ -313,10 +313,10 @@ void SBBreakpointLocation::SetThreadID(tid_t thread_id) { } } -tid_t SBBreakpointLocation::GetThreadID() { +lldb::tid_t SBBreakpointLocation::GetThreadID() { LLDB_INSTRUMENT_VA(this); - tid_t tid = 
LLDB_INVALID_THREAD_ID; + lldb::tid_t tid = LLDB_INVALID_THREAD_ID; BreakpointLocationSP loc_sp = GetSP(); if (loc_sp) { std::lock_guard guard( diff --git a/lldb/source/API/SBBreakpointName.cpp b/lldb/source/API/SBBreakpointName.cpp index 7f63aaf6fa7d5..5c7c0a8f6504b 100644 --- a/lldb/source/API/SBBreakpointName.cpp +++ b/lldb/source/API/SBBreakpointName.cpp @@ -347,7 +347,7 @@ bool SBBreakpointName::GetAutoContinue() { return bp_name->GetOptions().IsAutoContinue(); } -void SBBreakpointName::SetThreadID(tid_t tid) { +void SBBreakpointName::SetThreadID(lldb::tid_t tid) { LLDB_INSTRUMENT_VA(this, tid); BreakpointName *bp_name = GetBreakpointName(); @@ -361,7 +361,7 @@ void SBBreakpointName::SetThreadID(tid_t tid) { UpdateName(*bp_name); } -tid_t SBBreakpointName::GetThreadID() { +lldb::tid_t SBBreakpointName::GetThreadID() { LLDB_INSTRUMENT_VA(this); BreakpointName *bp_name = GetBreakpointName(); diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp index 7758a87403b5a..ea43a7f98b69f 100644 --- a/lldb/source/Core/DynamicLoader.cpp +++ b/lldb/source/Core/DynamicLoader.cpp @@ -113,6 +113,16 @@ void DynamicLoader::UpdateLoadedSections(ModuleSP module, addr_t link_map_addr, UpdateLoadedSectionsCommon(module, base_addr, base_addr_is_offset); } +void DynamicLoader::UpdateLoadedSectionsByType(lldb::ModuleSP module, + lldb::addr_t link_map_addr, + lldb::addr_t base_addr, + bool base_addr_is_offset, + int type_id) { + bool changed; + module->SetLoadAddressByType(m_process->GetTarget(), base_addr, base_addr_is_offset, + changed, type_id); +} + void DynamicLoader::UpdateLoadedSectionsCommon(ModuleSP module, addr_t base_addr, bool base_addr_is_offset) { diff --git a/lldb/source/Core/Mangled.cpp b/lldb/source/Core/Mangled.cpp index 387c4fac6b0f8..43c5b043ef7a2 100644 --- a/lldb/source/Core/Mangled.cpp +++ b/lldb/source/Core/Mangled.cpp @@ -167,12 +167,14 @@ static char *GetItaniumDemangledStr(const char *M) { "Expected demangled_size to return 
length including trailing null"); } +#if !defined(__AIX__) if (Log *log = GetLog(LLDBLog::Demangle)) { if (demangled_cstr) LLDB_LOGF(log, "demangled itanium: %s -> \"%s\"", M, demangled_cstr); else LLDB_LOGF(log, "demangled itanium: %s -> error: failed to demangle", M); } +#endif return demangled_cstr; } diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp index f9d7832254f46..044a5d29978e8 100644 --- a/lldb/source/Core/Module.cpp +++ b/lldb/source/Core/Module.cpp @@ -1510,6 +1510,18 @@ bool Module::SetLoadAddress(Target &target, lldb::addr_t value, return false; } +bool Module::SetLoadAddressByType(Target &target, lldb::addr_t value, + bool value_is_offset, bool &changed, int type_id) { + ObjectFile *object_file = GetObjectFile(); + if (object_file != nullptr) { + changed = object_file->SetLoadAddressByType(target, value, value_is_offset, type_id); + return true; + } else { + changed = false; + } + return false; +} + bool Module::MatchesModuleSpec(const ModuleSpec &module_ref) { const UUID &uuid = module_ref.GetUUID(); diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp index 0763e88d4608f..9ed55853930a6 100644 --- a/lldb/source/Core/Section.cpp +++ b/lldb/source/Core/Section.cpp @@ -263,6 +263,10 @@ bool Section::ResolveContainedAddress(addr_t offset, Address &so_addr, bool Section::ContainsFileAddress(addr_t vm_addr) const { const addr_t file_addr = GetFileAddress(); +#ifdef __AIX__ + if (file_addr == 0) + return false; +#endif if (file_addr != LLDB_INVALID_ADDRESS && !IsThreadSpecific()) { if (file_addr <= vm_addr) { const addr_t offset = (vm_addr - file_addr) * m_target_byte_size; diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index 444e44b392891..c1feec990f989 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -130,7 +130,7 @@ static llvm::Error ReadRegisterValueAsScalar(RegisterContext *reg_ctx, /// Return the 
length in bytes of the set of operands for \p op. No guarantees /// are made on the state of \p data after this call. -static offset_t GetOpcodeDataSize(const DataExtractor &data, +static lldb::offset_t GetOpcodeDataSize(const DataExtractor &data, const lldb::offset_t data_offset, const uint8_t op, const DWARFUnit *dwarf_cu) { lldb::offset_t offset = data_offset; @@ -358,7 +358,7 @@ lldb::addr_t DWARFExpression::GetLocation_DW_OP_addr(const DWARFUnit *dwarf_cu, error = true; break; } - const offset_t op_arg_size = + const lldb::offset_t op_arg_size = GetOpcodeDataSize(m_data, offset, op, dwarf_cu); if (op_arg_size == LLDB_INVALID_OFFSET) { error = true; @@ -418,7 +418,7 @@ bool DWARFExpression::Update_DW_OP_addr(const DWARFUnit *dwarf_cu, m_data.SetData(encoder.GetDataBuffer()); return true; } - const offset_t op_arg_size = + const lldb::offset_t op_arg_size = GetOpcodeDataSize(m_data, offset, op, dwarf_cu); if (op_arg_size == LLDB_INVALID_OFFSET) break; @@ -435,7 +435,7 @@ bool DWARFExpression::ContainsThreadLocalStorage( if (op == DW_OP_form_tls_address || op == DW_OP_GNU_push_tls_address) return true; - const offset_t op_arg_size = + const lldb::offset_t op_arg_size = GetOpcodeDataSize(m_data, offset, op, dwarf_cu); if (op_arg_size == LLDB_INVALID_OFFSET) return false; @@ -515,7 +515,7 @@ bool DWARFExpression::LinkThreadLocalStorage( } if (!decoded_data) { - const offset_t op_arg_size = + const lldb::offset_t op_arg_size = GetOpcodeDataSize(m_data, offset, op, dwarf_cu); if (op_arg_size == LLDB_INVALID_OFFSET) return false; diff --git a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt index c2e091ee8555b..5374b16881950 100644 --- a/lldb/source/Host/CMakeLists.txt +++ b/lldb/source/Host/CMakeLists.txt @@ -7,6 +7,11 @@ if (APPLE AND LLVM_ENABLE_LOCAL_SUBMODULE_VISIBILITY) endif() endif() +if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + remove_definitions("-D_XOPEN_SOURCE=700") + add_definitions("-D_ALL_SOURCE") +endif() + 
macro(add_host_subdirectory group) list(APPEND HOST_SOURCES ${ARGN}) source_group(${group} FILES ${ARGN}) @@ -133,6 +138,14 @@ else() openbsd/Host.cpp openbsd/HostInfoOpenBSD.cpp ) + + elseif (CMAKE_SYSTEM_NAME MATCHES "AIX") + add_host_subdirectory(aix + aix/AbstractSocket.cpp + aix/Host.cpp + aix/HostInfoAIX.cpp + aix/Support.cpp + ) endif() endif() diff --git a/lldb/source/Host/aix/AbstractSocket.cpp b/lldb/source/Host/aix/AbstractSocket.cpp new file mode 100644 index 0000000000000..bfb67d452f7ec --- /dev/null +++ b/lldb/source/Host/aix/AbstractSocket.cpp @@ -0,0 +1,21 @@ +//===-- AbstractSocket.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Host/aix/AbstractSocket.h" + +#include "llvm/ADT/StringRef.h" + +using namespace lldb; +using namespace lldb_private; + +AbstractSocket::AbstractSocket(bool child_processes_inherit) + : DomainSocket(ProtocolUnixAbstract, child_processes_inherit) {} + +size_t AbstractSocket::GetNameOffset() const { return 1; } + +void AbstractSocket::DeleteSocketFile(llvm::StringRef name) {} diff --git a/lldb/source/Host/aix/Host.cpp b/lldb/source/Host/aix/Host.cpp new file mode 100644 index 0000000000000..d82cb9049d389 --- /dev/null +++ b/lldb/source/Host/aix/Host.cpp @@ -0,0 +1,304 @@ +//===-- source/Host/aix/Host.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Object/ELF.h" +#include "llvm/Support/ScopedPrinter.h" + +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/ProcessInfo.h" +#include "lldb/Utility/Status.h" + +#include "lldb/Host/FileSystem.h" +#include "lldb/Host/Host.h" +#include "lldb/Host/HostInfo.h" +#include "lldb/Host/aix/Host.h" +#include "lldb/Host/aix/Support.h" +#include "lldb/Utility/DataExtractor.h" +#include "llvm/BinaryFormat/XCOFF.h" + +#include +#include + +using namespace llvm; +using namespace lldb; +using namespace lldb_private; + +namespace { +enum class ProcessState { + Unknown, + Dead, + DiskSleep, + Idle, + Paging, + Parked, + Running, + Sleeping, + TracedOrStopped, + Zombie, +}; +} + +namespace lldb_private { +class ProcessLaunchInfo; +} + +static bool GetStatusInfo(::pid_t Pid, ProcessInstanceInfo &ProcessInfo, + ProcessState &State, ::pid_t &TracerPid, + ::pid_t &Tgid) { + Log *log = GetLog(LLDBLog::Host); + + auto BufferOrError = getProcFile(Pid, "status"); + if (!BufferOrError) + return false; + + llvm::StringRef Rest = BufferOrError.get()->getBuffer(); + while (!Rest.empty()) { + llvm::StringRef Line; + std::tie(Line, Rest) = Rest.split('\n'); + + if (Line.consume_front("Gid:")) { + // Real, effective, saved set, and file system GIDs. Read the first two. + Line = Line.ltrim(); + uint32_t RGid, EGid; + Line.consumeInteger(10, RGid); + Line = Line.ltrim(); + Line.consumeInteger(10, EGid); + + ProcessInfo.SetGroupID(RGid); + ProcessInfo.SetEffectiveGroupID(EGid); + } else if (Line.consume_front("Uid:")) { + // Real, effective, saved set, and file system UIDs. Read the first two. 
+ Line = Line.ltrim(); + uint32_t RUid, EUid; + Line.consumeInteger(10, RUid); + Line = Line.ltrim(); + Line.consumeInteger(10, EUid); + + ProcessInfo.SetUserID(RUid); + ProcessInfo.SetEffectiveUserID(EUid); + } else if (Line.consume_front("PPid:")) { + ::pid_t PPid; + Line.ltrim().consumeInteger(10, PPid); + ProcessInfo.SetParentProcessID(PPid); + } else if (Line.consume_front("State:")) { + State = llvm::StringSwitch(Line.ltrim().take_front(1)) + .Case("D", ProcessState::DiskSleep) + .Case("I", ProcessState::Idle) + .Case("R", ProcessState::Running) + .Case("S", ProcessState::Sleeping) + .CaseLower("T", ProcessState::TracedOrStopped) + .Case("W", ProcessState::Paging) + .Case("P", ProcessState::Parked) + .Case("X", ProcessState::Dead) + .Case("Z", ProcessState::Zombie) + .Default(ProcessState::Unknown); + if (State == ProcessState::Unknown) { + LLDB_LOG(log, "Unknown process state {0}", Line); + } + } else if (Line.consume_front("TracerPid:")) { + Line = Line.ltrim(); + Line.consumeInteger(10, TracerPid); + } else if (Line.consume_front("Tgid:")) { + Line = Line.ltrim(); + Line.consumeInteger(10, Tgid); + } + } + return true; +} + +static bool IsDirNumeric(const char *dname) { + for (; *dname; dname++) { + if (!isdigit(*dname)) + return false; + } + return true; +} + +static void GetProcessArgs(::pid_t pid, ProcessInstanceInfo &process_info) { + auto BufferOrError = getProcFile(pid, "cmdline"); + if (!BufferOrError) + return; + std::unique_ptr Cmdline = std::move(*BufferOrError); + + llvm::StringRef Arg0, Rest; + std::tie(Arg0, Rest) = Cmdline->getBuffer().split('\0'); + process_info.SetArg0(Arg0); + while (!Rest.empty()) { + llvm::StringRef Arg; + std::tie(Arg, Rest) = Rest.split('\0'); + process_info.GetArguments().AppendArgument(Arg); + } +} + +static void GetExePathAndArch(::pid_t pid, ProcessInstanceInfo &process_info) { + Log *log = GetLog(LLDBLog::Process); + std::string ExePath(PATH_MAX, '\0'); + std::string Basename(PATH_MAX, '\0'); + struct psinfo 
psinfoData; + + // We can't use getProcFile here because proc/[pid]/exe is a symbolic link. + llvm::SmallString<64> ProcExe; + (llvm::Twine("/proc/") + llvm::Twine(pid) + "/cwd").toVector(ProcExe); + + ssize_t len = readlink(ProcExe.c_str(), &ExePath[0], PATH_MAX); + if (len > 0) { + ExePath.resize(len); + + //FIXME: hack to get basename + struct stat statData; + + std::ostringstream oss; + + oss << "/proc/" << std::dec << pid << "/psinfo"; + assert(stat(oss.str().c_str(), &statData) == 0); + + const int fd = open(oss.str().c_str(), O_RDONLY); + assert (fd >= 0); + + ssize_t readNum = read(fd, &psinfoData, sizeof(psinfoData)); + assert (readNum >= 0); + + close (fd); + } else { + LLDB_LOG(log, "failed to read link exe link for {0}: {1}", pid, + Status(errno, eErrorTypePOSIX)); + ExePath.resize(0); + } + + llvm::StringRef PathRef = std::string(&(psinfoData.pr_psargs[0])); + + if (!PathRef.empty()) { + process_info.GetExecutableFile().SetFile(PathRef, FileSpec::Style::native); + ArchSpec arch_spec = ArchSpec(); + arch_spec.SetArchitecture(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE, llvm::Triple::AIX); + process_info.SetArchitecture(arch_spec); + } +} + +static void GetProcessEnviron(::pid_t pid, ProcessInstanceInfo &process_info) { + // Get the process environment. 
+ auto BufferOrError = getProcFile(pid, "environ"); + if (!BufferOrError) + return; + + std::unique_ptr Environ = std::move(*BufferOrError); + llvm::StringRef Rest = Environ->getBuffer(); + while (!Rest.empty()) { + llvm::StringRef Var; + std::tie(Var, Rest) = Rest.split('\0'); + process_info.GetEnvironment().insert(Var); + } +} + +static bool GetProcessAndStatInfo(::pid_t pid, + ProcessInstanceInfo &process_info, + ProcessState &State, ::pid_t &tracerpid) { + ::pid_t tgid; + tracerpid = 0; + process_info.Clear(); + + process_info.SetProcessID(pid); + + GetExePathAndArch(pid, process_info); + GetProcessArgs(pid, process_info); + GetProcessEnviron(pid, process_info); + + // Get User and Group IDs and get tracer pid. + if (!GetStatusInfo(pid, process_info, State, tracerpid, tgid)) + return false; + + return true; +} + +uint32_t Host::FindProcessesImpl(const ProcessInstanceInfoMatch &match_info, + ProcessInstanceInfoList &process_infos) { + static const char procdir[] = "/proc/"; + + DIR *dirproc = opendir(procdir); + if (dirproc) { + struct dirent *direntry = nullptr; + const uid_t our_uid = getuid(); + const lldb::pid_t our_pid = getpid(); + bool all_users = match_info.GetMatchAllUsers(); + + while ((direntry = readdir(dirproc)) != nullptr) { + /* + if (direntry->d_type != DT_DIR || !IsDirNumeric(direntry->d_name)) + continue; + */ + + lldb::pid_t pid = atoi(direntry->d_name); + + // Skip this process. + if (pid == our_pid) + continue; + + ::pid_t tracerpid; + ProcessState State; + ProcessInstanceInfo process_info; + + if (!GetProcessAndStatInfo(pid, process_info, State, tracerpid)) + continue; + + // Skip if process is being debugged. + if (tracerpid != 0) + continue; + + if (State == ProcessState::Zombie) + continue; + + // Check for user match if we're not matching all users and not running + // as root. 
+ if (!all_users && (our_uid != 0) && (process_info.GetUserID() != our_uid)) + continue; + + if (match_info.Matches(process_info)) { + process_infos.push_back(process_info); + } + } + + closedir(dirproc); + } + + return process_infos.size(); +} + +bool Host::GetProcessInfo(lldb::pid_t pid, ProcessInstanceInfo &process_info) { + ::pid_t tracerpid; + ProcessState State; + return GetProcessAndStatInfo(pid, process_info, State, tracerpid); +} + +Environment Host::GetEnvironment() { return Environment(environ); } + +Status Host::ShellExpandArguments(ProcessLaunchInfo &launch_info) { + return Status("unimplemented"); +} + +std::optional lldb_private::getPIDForTID(lldb::pid_t tid) { + ::pid_t tracerpid, tgid = LLDB_INVALID_PROCESS_ID; + ProcessInstanceInfo process_info; + ProcessState state; + + if (!GetStatusInfo(tid, process_info, state, tracerpid, tgid) || + tgid == LLDB_INVALID_PROCESS_ID) + return std::nullopt; + return tgid; +} diff --git a/lldb/source/Host/aix/HostInfoAIX.cpp b/lldb/source/Host/aix/HostInfoAIX.cpp new file mode 100644 index 0000000000000..8bda09e01741b --- /dev/null +++ b/lldb/source/Host/aix/HostInfoAIX.cpp @@ -0,0 +1,215 @@ +//===-- HostInfoAIX.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Host/aix/HostInfoAIX.h" +#include "lldb/Host/Config.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" + +#include "llvm/Support/Threading.h" + +#include +#include +#include +#include +#include + +#include +#include + +using namespace lldb_private; + +namespace { +struct HostInfoAIXFields { + llvm::once_flag m_distribution_once_flag; + std::string m_distribution_id; + llvm::once_flag m_os_version_once_flag; + llvm::VersionTuple m_os_version; +}; +} // namespace + +static HostInfoAIXFields *g_fields = nullptr; + +void HostInfoAIX::Initialize(SharedLibraryDirectoryHelper *helper) { + HostInfoPosix::Initialize(helper); + + g_fields = new HostInfoAIXFields(); +} + +void HostInfoAIX::Terminate() { + assert(g_fields && "Missing call to Initialize?"); + delete g_fields; + g_fields = nullptr; + HostInfoBase::Terminate(); +} + +llvm::VersionTuple HostInfoAIX::GetOSVersion() { + assert(g_fields && "Missing call to Initialize?"); + llvm::call_once(g_fields->m_os_version_once_flag, []() { + struct utsname un; + if (uname(&un) != 0) + return; + + llvm::StringRef release = un.release; + // The kernel release string can include a lot of stuff (e.g. + // 4.9.0-6-amd64). We're only interested in the numbered prefix. + release = release.substr(0, release.find_first_not_of("0123456789.")); + g_fields->m_os_version.tryParse(release); + }); + + return g_fields->m_os_version; +} + +std::optional HostInfoAIX::GetOSBuildString() { + struct utsname un; + ::memset(&un, 0, sizeof(utsname)); + + if (uname(&un) < 0) + return std::nullopt; + + return std::string(un.release); +} + +llvm::StringRef HostInfoAIX::GetDistributionId() { + assert(g_fields && "Missing call to Initialize?"); + // Try to run 'lsb_release -i', and use that response for the distribution + id.
+ llvm::call_once(g_fields->m_distribution_once_flag, []() { + Log *log = GetLog(LLDBLog::Host); + LLDB_LOGF(log, "attempting to determine AIX distribution..."); + + // check if the lsb_release command exists at one of the following paths + const char *const exe_paths[] = {"/bin/lsb_release", + "/usr/bin/lsb_release"}; + + for (size_t exe_index = 0; + exe_index < sizeof(exe_paths) / sizeof(exe_paths[0]); ++exe_index) { + const char *const get_distribution_info_exe = exe_paths[exe_index]; + if (access(get_distribution_info_exe, F_OK)) { + // this exe doesn't exist, move on to next exe + LLDB_LOGF(log, "executable doesn't exist: %s", + get_distribution_info_exe); + continue; + } + + // execute the distribution-retrieval command, read output + std::string get_distribution_id_command(get_distribution_info_exe); + get_distribution_id_command += " -i"; + + FILE *file = popen(get_distribution_id_command.c_str(), "r"); + if (!file) { + LLDB_LOGF(log, + "failed to run command: \"%s\", cannot retrieve " + "platform information", + get_distribution_id_command.c_str()); + break; + } + + // retrieve the distribution id string. + char distribution_id[256] = {'\0'}; + if (fgets(distribution_id, sizeof(distribution_id) - 1, file) != + nullptr) { + LLDB_LOGF(log, "distribution id command returned \"%s\"", + distribution_id); + + const char *const distributor_id_key = "Distributor ID:\t"; + if (strstr(distribution_id, distributor_id_key)) { + // strip newlines + std::string id_string(distribution_id + strlen(distributor_id_key)); + id_string.erase(std::remove(id_string.begin(), id_string.end(), '\n'), + id_string.end()); + + // lower case it and convert whitespace to underscores + std::transform( + id_string.begin(), id_string.end(), id_string.begin(), + [](char ch) { return tolower(isspace(ch) ? 
'_' : ch); }); + + g_fields->m_distribution_id = id_string; + LLDB_LOGF(log, "distribution id set to \"%s\"", + g_fields->m_distribution_id.c_str()); + } else { + LLDB_LOGF(log, "failed to find \"%s\" field in \"%s\"", + distributor_id_key, distribution_id); + } + } else { + LLDB_LOGF(log, + "failed to retrieve distribution id, \"%s\" returned no" + " lines", + get_distribution_id_command.c_str()); + } + + // clean up the file + pclose(file); + } + }); + + return g_fields->m_distribution_id; +} + +FileSpec HostInfoAIX::GetProgramFileSpec() { + static FileSpec g_program_filespec; + + if (!g_program_filespec) { + char exe_path[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1); + if (len > 0) { + exe_path[len] = 0; + g_program_filespec.SetFile(exe_path, FileSpec::Style::native); + } + } + + return g_program_filespec; +} + +bool HostInfoAIX::ComputeSupportExeDirectory(FileSpec &file_spec) { + if (HostInfoPosix::ComputeSupportExeDirectory(file_spec) && + file_spec.IsAbsolute() && FileSystem::Instance().Exists(file_spec)) + return true; + file_spec.SetDirectory(GetProgramFileSpec().GetDirectory()); + return !file_spec.GetDirectory().IsEmpty(); +} + +bool HostInfoAIX::ComputeSystemPluginsDirectory(FileSpec &file_spec) { + FileSpec temp_file("/usr/" LLDB_INSTALL_LIBDIR_BASENAME "/lldb/plugins"); + FileSystem::Instance().Resolve(temp_file); + file_spec.SetDirectory(temp_file.GetPath()); + return true; +} + +bool HostInfoAIX::ComputeUserPluginsDirectory(FileSpec &file_spec) { + // XDG Base Directory Specification + // http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html If + // XDG_DATA_HOME exists, use that, otherwise use ~/.local/share/lldb. 
+ const char *xdg_data_home = getenv("XDG_DATA_HOME"); + if (xdg_data_home && xdg_data_home[0]) { + std::string user_plugin_dir(xdg_data_home); + user_plugin_dir += "/lldb"; + file_spec.SetDirectory(user_plugin_dir.c_str()); + } else + file_spec.SetDirectory("~/.local/share/lldb"); + return true; +} + +void HostInfoAIX::ComputeHostArchitectureSupport(ArchSpec &arch_32, + ArchSpec &arch_64) { + HostInfoPosix::ComputeHostArchitectureSupport(arch_32, arch_64); + + const char *distribution_id = GetDistributionId().data(); + + // On AIX, "unknown" in the vendor slot isn't what we want for the default + // triple. It's probably an artifact of config.guess. + if (arch_32.IsValid()) { + if (arch_32.GetTriple().getVendor() == llvm::Triple::UnknownVendor) + arch_32.GetTriple().setVendorName(llvm::StringRef()); + } + if (arch_64.IsValid()) { + if (arch_64.GetTriple().getVendor() == llvm::Triple::UnknownVendor) + arch_64.GetTriple().setVendorName(llvm::StringRef()); + } +} diff --git a/lldb/source/Host/aix/Support.cpp b/lldb/source/Host/aix/Support.cpp new file mode 100644 index 0000000000000..1bf2662190127 --- /dev/null +++ b/lldb/source/Host/aix/Support.cpp @@ -0,0 +1,44 @@ +//===-- Support.cpp -------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Host/aix/Support.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "llvm/Support/MemoryBuffer.h" + +llvm::ErrorOr> +lldb_private::getProcFile(::pid_t pid, ::pid_t tid, const llvm::Twine &file) { + Log *log = GetLog(LLDBLog::Host); + std::string File = + ("/proc/" + llvm::Twine(pid) + "/task/" + llvm::Twine(tid) + "/" + file) + .str(); + auto Ret = llvm::MemoryBuffer::getFileAsStream(File); + if (!Ret) + LLDB_LOG(log, "Failed to open {0}: {1}", File, Ret.getError().message()); + return Ret; +} + +llvm::ErrorOr> +lldb_private::getProcFile(::pid_t pid, const llvm::Twine &file) { + Log *log = GetLog(LLDBLog::Host); + std::string File = ("/proc/" + llvm::Twine(pid) + "/" + file).str(); + auto Ret = llvm::MemoryBuffer::getFileAsStream(File); + if (!Ret) + LLDB_LOG(log, "Failed to open {0}: {1}", File, Ret.getError().message()); + return Ret; +} + +llvm::ErrorOr> +lldb_private::getProcFile(const llvm::Twine &file) { + Log *log = GetLog(LLDBLog::Host); + std::string File = ("/proc/" + file).str(); + auto Ret = llvm::MemoryBuffer::getFileAsStream(File); + if (!Ret) + LLDB_LOG(log, "Failed to open {0}: {1}", File, Ret.getError().message()); + return Ret; +} diff --git a/lldb/source/Host/common/GetOptInc.cpp b/lldb/source/Host/common/GetOptInc.cpp index c2044b6873221..e0ae2aa1774b3 100644 --- a/lldb/source/Host/common/GetOptInc.cpp +++ b/lldb/source/Host/common/GetOptInc.cpp @@ -409,7 +409,7 @@ static int getopt_internal(int nargc, char *const *nargv, const char *options, * [eventually this will replace the BSD getopt] */ #if defined(REPLACE_GETOPT) -int getopt(int nargc, char *const *nargv, const char *options) { +int getopt(int nargc, char *const *nargv, const char *options) throw() { /* * We don't pass FLAG_PERMUTE to getopt_internal() since diff --git 
a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index e03d36e9cad4a..2fd7111a94fb2 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -357,9 +357,183 @@ bool Host::ResolveExecutableInBundle(FileSpec &file) { return false; } #ifndef _WIN32 +#if defined(__AIX__) + +#include +extern char **p_xargv; + +/* Fix missing Dl_info & dladdr in AIX + * The code is taken from netbsd.org (src/crypto/external/bsd/openssl/dist/crypto/dso/dso_dlfcn.c) + * except strlcpy & strlcat (those are taken from openbsd.org (src/lib/libc/string)) + */ +/*- + * See IBM's AIX Version 7.2, Technical Reference: + * Base Operating System and Extensions, Volume 1 and 2 + * https://www.ibm.com/support/knowledgecenter/ssw_aix_72/com.ibm.aix.base/technicalreferences.htm + */ +#include +#include + +/* strlcpy: + * Copy string src to buffer dst of size dsize. At most dsize-1 + * chars will be copied. Always NUL terminates (unless dsize == 0). + * Returns strlen(src); if retval >= dsize, truncation occurred. + */ +size_t strlcpy(char *dst, const char *src, size_t dsize) +{ + const char *osrc = src; + size_t nleft = dsize; + + /* Copy as many bytes as will fit. */ + if (nleft != 0) { + while (--nleft != 0) { + if ((*dst++ = *src++) == '\0') { + break; + } + } + } + + /* Not enough room in dst, add NUL and traverse rest of src. */ + if (nleft == 0) { + if (dsize != 0) { + *dst = '\0'; /* NUL-terminate dst */ + } + while (*src++) { + ; + } + } + + return src - osrc - 1; /* count does not include NUL */ +} + +/* strlcat: + * Appends src to string dst of size dsize (unlike strncat, dsize is the + * full size of dst, not space left). At most dsize-1 characters + * will be copied. Always NUL terminates (unless dsize <= strlen(dst)). + * Returns strlen(src) + MIN(dsize, strlen(initial dst)). + * If retval >= dsize, truncation occurred. 
+ */ +size_t strlcat(char *dst, const char *src, size_t dsize) +{ + const char *odst = dst; + const char *osrc = src; + size_t n = dsize; + size_t dlen; + + /* Find the end of dst and adjust bytes left but don't go past end. */ + while (n-- != 0 && *dst != '\0') { + dst++; + } + dlen = dst - odst; + n = dsize - dlen; + + if (n-- == 0) { + return dlen + strlen(src); + } + while (*src != '\0') { + if (n != 0) { + *dst++ = *src; + n--; + } + src++; + } + *dst = '\0'; + + return dlen + src - osrc; /* count does not include NUL */ +} + +/* ~ 64 * (sizeof(struct ld_info) + _XOPEN_PATH_MAX + _XOPEN_NAME_MAX) */ +# define DLFCN_LDINFO_SIZE 86976 +typedef struct Dl_info { + const char *dli_fname; +} Dl_info; +/* + * This dladdr()-implementation will also find the ptrgl (Pointer Glue) virtual + * address of a function, which is just located in the DATA segment instead of + * the TEXT segment. + */ +static int dladdr(const void *ptr, Dl_info *dl) +{ + uintptr_t addr = (uintptr_t)ptr; + struct ld_info *ldinfos; + struct ld_info *next_ldi; + struct ld_info *this_ldi; + + if ((ldinfos = (struct ld_info *)malloc(DLFCN_LDINFO_SIZE)) == NULL) { + dl->dli_fname = NULL; + return 0; + } + + if ((loadquery(L_GETINFO, (void *)ldinfos, DLFCN_LDINFO_SIZE)) < 0) { + /*- + * Error handling is done through errno and dlerror() reading errno: + * ENOMEM (ldinfos buffer is too small), + * EINVAL (invalid flags), + * EFAULT (invalid ldinfos ptr) + */ + free((void *)ldinfos); + dl->dli_fname = NULL; + return 0; + } + next_ldi = ldinfos; + + do { + this_ldi = next_ldi; + if (((addr >= (uintptr_t)this_ldi->ldinfo_textorg) + && (addr < ((uintptr_t)this_ldi->ldinfo_textorg + + this_ldi->ldinfo_textsize))) + || ((addr >= (uintptr_t)this_ldi->ldinfo_dataorg) + && (addr < ((uintptr_t)this_ldi->ldinfo_dataorg + + this_ldi->ldinfo_datasize)))) { + char *buffer = NULL; + char *member = NULL; + size_t buffer_sz; + size_t member_len; + + buffer_sz = strlen(this_ldi->ldinfo_filename) + 1; + member = 
this_ldi->ldinfo_filename + buffer_sz; + if ((member_len = strlen(member)) > 0) { + buffer_sz += 1 + member_len + 1; + } + if ((buffer = (char *)malloc(buffer_sz)) != NULL) { + strlcpy(buffer, this_ldi->ldinfo_filename, buffer_sz); + if (member_len > 0) { + /* + * Need to respect a possible member name and not just + * returning the path name in this case. See docs: + * sys/ldr.h, loadquery() and dlopen()/RTLD_MEMBER. + */ + strlcat(buffer, "(", buffer_sz); + strlcat(buffer, member, buffer_sz); + strlcat(buffer, ")", buffer_sz); + } + dl->dli_fname = buffer; + } + break; + } else { + next_ldi = (struct ld_info *)((uintptr_t)this_ldi + + this_ldi->ldinfo_next); + } + } while (this_ldi->ldinfo_next); + free((void *)ldinfos); + return dl->dli_fname != NULL; +} + +#endif + FileSpec Host::GetModuleFileSpecForHostAddress(const void *host_addr) { FileSpec module_filespec; #if !defined(__ANDROID__) +#ifdef __AIX__ + if (host_addr == reinterpret_cast(HostInfoBase::ComputeSharedLibraryDirectory)) { + // FIXME: AIX dladdr return "lldb" for this case + if (p_xargv[0]) { + module_filespec.SetFile(p_xargv[0], FileSpec::Style::native); + FileSystem::Instance().Resolve(module_filespec); + return module_filespec; + } + } +#endif Dl_info info; if (::dladdr(host_addr, &info)) { if (info.dli_fname) { @@ -373,12 +547,6 @@ FileSpec Host::GetModuleFileSpecForHostAddress(const void *host_addr) { #endif -#if !defined(__linux__) -bool Host::FindProcessThreads(const lldb::pid_t pid, TidMap &tids_to_attach) { - return false; -} -#endif - struct ShellInfo { ShellInfo() : process_reaped(false) {} diff --git a/lldb/source/Host/common/LICENSE.aix-netbsd.txt b/lldb/source/Host/common/LICENSE.aix-netbsd.txt new file mode 100644 index 0000000000000..9601ab43575f9 --- /dev/null +++ b/lldb/source/Host/common/LICENSE.aix-netbsd.txt @@ -0,0 +1,125 @@ + + LICENSE ISSUES + ============== + + The OpenSSL toolkit stays under a double license, i.e. 
both the conditions of + the OpenSSL License and the original SSLeay license apply to the toolkit. + See below for the actual license texts. + + OpenSSL License + --------------- + +/* ==================================================================== + * Copyright (c) 1998-2019 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + diff --git a/lldb/source/Host/common/XML.cpp b/lldb/source/Host/common/XML.cpp index f480ef3166a44..62cac78aaac23 100644 --- a/lldb/source/Host/common/XML.cpp +++ b/lldb/source/Host/common/XML.cpp @@ -10,6 +10,9 @@ #include "lldb/Host/XML.h" #include "llvm/ADT/StringExtras.h" +#if defined(__AIX__) +#undef LLDB_ENABLE_LIBXML2 +#endif using namespace lldb; using namespace lldb_private; diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp index fceeff08ed9d3..143254bb12901 100644 --- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp +++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp @@ -721,6 +721,7 @@ ConnectionFileDescriptor::ConnectFD(llvm::StringRef s, ConnectionStatus ConnectionFileDescriptor::ConnectFile( llvm::StringRef s, socket_id_callback_type socket_id_callback, Status *error_ptr) { +#if !defined(__AIX__) #if LLDB_ENABLE_POSIX std::string addr_str = s.str(); // file:///PATH @@ -753,6 +754,7 @@ ConnectionStatus ConnectionFileDescriptor::ConnectFile( m_io_sp = std::make_shared(fd, File::eOpenOptionReadWrite, true); return eConnectionStatusSuccess; #endif // LLDB_ENABLE_POSIX +#endif 
llvm_unreachable("this function should be only called w/ LLDB_ENABLE_POSIX"); } diff --git a/lldb/source/Host/posix/FileSystemPosix.cpp b/lldb/source/Host/posix/FileSystemPosix.cpp index cdb76da626bc9..a7c50f6a3c835 100644 --- a/lldb/source/Host/posix/FileSystemPosix.cpp +++ b/lldb/source/Host/posix/FileSystemPosix.cpp @@ -11,7 +11,9 @@ // C includes #include #include +#if !defined(__AIX__) #include +#endif #include #include #include diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp index 5fe4d015251c8..e5be0db4cf19b 100644 --- a/lldb/source/Host/posix/MainLoopPosix.cpp +++ b/lldb/source/Host/posix/MainLoopPosix.cpp @@ -179,9 +179,21 @@ Status MainLoopPosix::RunImpl::Poll() { read_fds.push_back(pfd); } +#if defined(__AIX__) + sigset_t origmask; + int timeout; + + timeout = -1; + pthread_sigmask(SIG_SETMASK, &sigmask, &origmask); + int ready = poll(read_fds.data(), read_fds.size(), timeout); + pthread_sigmask(SIG_SETMASK, &origmask, nullptr); + if (ready == -1 && errno != EINTR) + return Status(errno, eErrorTypePOSIX); +#else if (ppoll(read_fds.data(), read_fds.size(), nullptr, &sigmask) == -1 && errno != EINTR) return Status(errno, eErrorTypePOSIX); +#endif return Status(); } @@ -312,8 +324,13 @@ MainLoopPosix::RegisterSignal(int signo, const Callback &callback, // If we're using kqueue, the signal needs to be unblocked in order to // receive it. If using pselect/ppoll, we need to block it, and later unblock // it as a part of the system call. +#if defined(__AIX__) + //FIXME: where is signal unblocked? + ret = pthread_sigmask(SIG_UNBLOCK, &new_action.sa_mask, &old_set); +#else ret = pthread_sigmask(HAVE_SYS_EVENT_H ? 
SIG_UNBLOCK : SIG_BLOCK, &new_action.sa_mask, &old_set); +#endif assert(ret == 0 && "pthread_sigmask failed"); info.was_blocked = sigismember(&old_set, signo); auto insert_ret = m_signals.insert({signo, info}); diff --git a/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp b/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp index 0a832ebad13a7..cd106f605b1f4 100644 --- a/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp +++ b/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp @@ -193,8 +193,13 @@ struct ForkLaunchInfo { } // Start tracing this child that is about to exec. +#if !defined(__AIX__) if (ptrace(PT_TRACE_ME, 0, nullptr, 0) == -1) ExitWithError(error_fd, "ptrace"); +#else + if (ptrace64(PT_TRACE_ME, 0, 0, 0, nullptr) == -1) + ExitWithError(error_fd, "ptrace"); +#endif } // Execute. We should never return... diff --git a/lldb/source/Initialization/CMakeLists.txt b/lldb/source/Initialization/CMakeLists.txt index c1a167826f76f..9f94830c8509f 100644 --- a/lldb/source/Initialization/CMakeLists.txt +++ b/lldb/source/Initialization/CMakeLists.txt @@ -1,4 +1,4 @@ -if ( CMAKE_SYSTEM_NAME MATCHES "Linux|Android|FreeBSD|NetBSD" ) +if ( CMAKE_SYSTEM_NAME MATCHES "Linux|Android|FreeBSD|NetBSD|AIX" ) list(APPEND EXTRA_PLUGINS lldbPluginProcessPOSIX) endif() diff --git a/lldb/source/Initialization/SystemInitializerCommon.cpp b/lldb/source/Initialization/SystemInitializerCommon.cpp index 1a172a95aa147..4b01442a94bac 100644 --- a/lldb/source/Initialization/SystemInitializerCommon.cpp +++ b/lldb/source/Initialization/SystemInitializerCommon.cpp @@ -19,7 +19,7 @@ #include "lldb/Version/Version.h" #if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || \ - defined(__OpenBSD__) + defined(__OpenBSD__) || defined(__AIX__) #include "Plugins/Process/POSIX/ProcessPOSIXLog.h" #endif @@ -79,7 +79,7 @@ llvm::Error SystemInitializerCommon::Initialize() { process_gdb_remote::ProcessGDBRemoteLog::Initialize(); #if defined(__linux__) || defined(__FreeBSD__) || 
defined(__NetBSD__) || \ - defined(__OpenBSD__) + defined(__OpenBSD__) || defined(__AIX__) ProcessPOSIXLog::Initialize(); #endif #if defined(_WIN32) diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp index eac058701313b..feb0d7c0e09be 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp @@ -156,6 +156,9 @@ bool ABISysV_ppc64::PrepareTrivialCall(Thread &thread, addr_t sp, if (!reg_ctx->WriteRegisterFromUnsigned(r12_reg_info, func_addr)) return false; +#if defined(__AIX__) + assert(0); +#else // Read TOC pointer value. reg_value = reg_ctx->ReadRegisterAsUnsigned(r2_reg_info, 0); @@ -171,6 +174,132 @@ bool ABISysV_ppc64::PrepareTrivialCall(Thread &thread, addr_t sp, (uint64_t)reg_value); if (!process_sp->WritePointerToMemory(sp + stack_offset, reg_value, error)) return false; +#endif + + // Read the current SP value. + reg_value = reg_ctx->ReadRegisterAsUnsigned(sp_reg_info, 0); + + // Save current SP onto the stack. + LLDB_LOGF(log, "Writing SP at SP(0x%" PRIx64 ")+0: 0x%" PRIx64, (uint64_t)sp, + (uint64_t)reg_value); + if (!process_sp->WritePointerToMemory(sp, reg_value, error)) + return false; + + // %r1 is set to the actual stack value. + LLDB_LOGF(log, "Writing SP: 0x%" PRIx64, (uint64_t)sp); + + if (!reg_ctx->WriteRegisterFromUnsigned(sp_reg_info, sp)) + return false; + + // %pc is set to the address of the called function. 
+ + LLDB_LOGF(log, "Writing IP: 0x%" PRIx64, (uint64_t)func_addr); + + if (!reg_ctx->WriteRegisterFromUnsigned(pc_reg_info, func_addr)) + return false; + + return true; +} + +bool ABISysV_ppc64::PrepareTrivialCall(Thread &thread, addr_t sp, + addr_t func_addr, addr_t toc_addr, + addr_t return_addr, + llvm::ArrayRef args) const { + Log *log = GetLog(LLDBLog::Expressions); + + if (log) { + StreamString s; + s.Printf("ABISysV_ppc64::PrepareTrivialCall (tid = 0x%" PRIx64 + ", sp = 0x%" PRIx64 ", func_addr = 0x%" PRIx64 + ", return_addr = 0x%" PRIx64, + thread.GetID(), (uint64_t)sp, (uint64_t)func_addr, + (uint64_t)return_addr); + + for (size_t i = 0; i < args.size(); ++i) + s.Printf(", arg%" PRIu64 " = 0x%" PRIx64, static_cast(i + 1), + args[i]); + s.PutCString(")"); + log->PutString(s.GetString()); + } + + RegisterContext *reg_ctx = thread.GetRegisterContext().get(); + if (!reg_ctx) + return false; + + const RegisterInfo *reg_info = nullptr; + + if (args.size() > 8) // TODO handle more than 8 arguments + return false; + + for (size_t i = 0; i < args.size(); ++i) { + reg_info = reg_ctx->GetRegisterInfo(eRegisterKindGeneric, + LLDB_REGNUM_GENERIC_ARG1 + i); + LLDB_LOGF(log, "About to write arg%" PRIu64 " (0x%" PRIx64 ") into %s", + static_cast(i + 1), args[i], reg_info->name); + if (!reg_ctx->WriteRegisterFromUnsigned(reg_info, args[i])) + return false; + } + + // First, align the SP + + LLDB_LOGF(log, "16-byte aligning SP: 0x%" PRIx64 " to 0x%" PRIx64, + (uint64_t)sp, (uint64_t)(sp & ~0xfull)); + + sp &= ~(0xfull); // 16-byte alignment + + sp -= 544; // allocate frame to save TOC, RA and SP. 
+ + Status error; + uint64_t reg_value; + const RegisterInfo *pc_reg_info = + reg_ctx->GetRegisterInfo(eRegisterKindGeneric, LLDB_REGNUM_GENERIC_PC); + const RegisterInfo *sp_reg_info = + reg_ctx->GetRegisterInfo(eRegisterKindGeneric, LLDB_REGNUM_GENERIC_SP); + ProcessSP process_sp(thread.GetProcess()); + const RegisterInfo *lr_reg_info = + reg_ctx->GetRegisterInfo(eRegisterKindGeneric, LLDB_REGNUM_GENERIC_RA); + const RegisterInfo *r2_reg_info = reg_ctx->GetRegisterInfoAtIndex(2); + const RegisterInfo *r12_reg_info = reg_ctx->GetRegisterInfoAtIndex(12); + + // Save return address onto the stack. + LLDB_LOGF(log, + "Pushing the return address onto the stack: 0x%" PRIx64 + "(+16): 0x%" PRIx64, + (uint64_t)sp, (uint64_t)return_addr); + if (!process_sp->WritePointerToMemory(sp + 16, return_addr, error)) + return false; + + // Write the return address to link register. + LLDB_LOGF(log, "Writing LR: 0x%" PRIx64, (uint64_t)return_addr); + if (!reg_ctx->WriteRegisterFromUnsigned(lr_reg_info, return_addr)) + return false; + + // Write target address to %r12 register. + LLDB_LOGF(log, "Writing R12: 0x%" PRIx64, (uint64_t)func_addr); + if (!reg_ctx->WriteRegisterFromUnsigned(r12_reg_info, func_addr)) + return false; + +#if defined(__AIX__) + LLDB_LOGF(log, "Writing R2: 0x%" PRIx64, (uint64_t)toc_addr); + if (!reg_ctx->WriteRegisterFromUnsigned(r2_reg_info, toc_addr)) + return false; +#else + // Read TOC pointer value. + reg_value = reg_ctx->ReadRegisterAsUnsigned(r2_reg_info, 0); + + // Write TOC pointer onto the stack. + uint64_t stack_offset; + if (GetByteOrder() == lldb::eByteOrderLittle) + stack_offset = 24; + else + stack_offset = 40; + + LLDB_LOGF(log, "Writing R2 (TOC) at SP(0x%" PRIx64 ")+%d: 0x%" PRIx64, + (uint64_t)(sp + stack_offset), (int)stack_offset, + (uint64_t)reg_value); + if (!process_sp->WritePointerToMemory(sp + stack_offset, reg_value, error)) + return false; +#endif // Read the current SP value. 
reg_value = reg_ctx->ReadRegisterAsUnsigned(sp_reg_info, 0); @@ -641,7 +770,7 @@ class ReturnValueExtractor { DataExtractor de(&raw_data, sizeof(raw_data), m_byte_order, m_addr_size); - offset_t offset = 0; + lldb::offset_t offset = 0; std::optional byte_size = type.GetByteSize(m_process_sp.get()); if (!byte_size) return {}; diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.h b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.h index bfa96cc0df703..d752a8ded9748 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.h +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.h @@ -23,6 +23,12 @@ class ABISysV_ppc64 : public lldb_private::RegInfoBasedABI { lldb::addr_t returnAddress, llvm::ArrayRef args) const override; + bool PrepareTrivialCall(lldb_private::Thread &thread, lldb::addr_t sp, + lldb::addr_t functionAddress, + lldb::addr_t tocAddress, + lldb::addr_t returnAddress, + llvm::ArrayRef args) const override; + bool GetArgumentValues(lldb_private::Thread &thread, lldb_private::ValueList &values) const override; diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/CMakeLists.txt b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/CMakeLists.txt new file mode 100644 index 0000000000000..02fe0d617955a --- /dev/null +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/CMakeLists.txt @@ -0,0 +1,11 @@ +add_definitions("-D_ALL_SOURCE") + +add_lldb_library(lldbPluginDynamicLoaderAIXDYLD PLUGIN + DynamicLoaderAIXDYLD.cpp + + LINK_LIBS + lldbCore + lldbTarget + LINK_COMPONENTS + Support + ) diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp new file mode 100644 index 0000000000000..62663974134b0 --- /dev/null +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp @@ -0,0 +1,272 @@ +//===-- DynamicLoaderAIXDYLD.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DynamicLoaderAIXDYLD.h" + +#include "lldb/Core/Module.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Target/ExecutionContext.h" +#include "lldb/Target/Platform.h" +#include "lldb/Target/Process.h" +#include "lldb/Target/RegisterContext.h" +#include "lldb/Target/Target.h" +#include "lldb/Target/ThreadPlanStepInstruction.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#if defined(__AIX__) +#include +#endif + +/*#include "llvm/ADT/Triple.h" +*/ + +using namespace lldb; +using namespace lldb_private; + +LLDB_PLUGIN_DEFINE(DynamicLoaderAIXDYLD) + +DynamicLoaderAIXDYLD::DynamicLoaderAIXDYLD(Process *process) + : DynamicLoader(process) {} + +DynamicLoaderAIXDYLD::~DynamicLoaderAIXDYLD() = default; + +void DynamicLoaderAIXDYLD::Initialize() { + PluginManager::RegisterPlugin(GetPluginNameStatic(), + GetPluginDescriptionStatic(), CreateInstance); +} + +void DynamicLoaderAIXDYLD::Terminate() {} + +llvm::StringRef DynamicLoaderAIXDYLD::GetPluginDescriptionStatic() { + return "Dynamic loader plug-in that watches for shared library " + "loads/unloads in AIX processes."; +} + +DynamicLoader *DynamicLoaderAIXDYLD::CreateInstance(Process *process, + bool force) { + bool should_create = force; + if (!should_create) { + const llvm::Triple &triple_ref = + process->GetTarget().GetArchitecture().GetTriple(); + if (triple_ref.getOS() == llvm::Triple::AIX) + should_create = true; + } + + if (should_create) + return new DynamicLoaderAIXDYLD(process); + + return nullptr; +} + +void DynamicLoaderAIXDYLD::OnLoadModule(lldb::ModuleSP module_sp, + const ModuleSpec module_spec, + lldb::addr_t module_addr) { + + // Resolve the module unless we already have one. 
+ if (!module_sp) { + Status error; + module_sp = m_process->GetTarget().GetOrCreateModule(module_spec, + true /* notify */, &error); + if (error.Fail()) + return; + } + + m_loaded_modules[module_sp] = module_addr; + UpdateLoadedSectionsCommon(module_sp, module_addr, false); + ModuleList module_list; + module_list.Append(module_sp); + m_process->GetTarget().ModulesDidLoad(module_list); +} + +void DynamicLoaderAIXDYLD::OnUnloadModule(lldb::addr_t module_addr) { + Address resolved_addr; + if (!m_process->GetTarget().ResolveLoadAddress(module_addr, resolved_addr)) + return; + + ModuleSP module_sp = resolved_addr.GetModule(); + if (module_sp) { + m_loaded_modules.erase(module_sp); + UnloadSectionsCommon(module_sp); + ModuleList module_list; + module_list.Append(module_sp); + m_process->GetTarget().ModulesDidUnload(module_list, false); + } +} + +lldb::addr_t DynamicLoaderAIXDYLD::GetLoadAddress(ModuleSP executable) { + // First, see if the load address is already cached. + auto it = m_loaded_modules.find(executable); + if (it != m_loaded_modules.end() && it->second != LLDB_INVALID_ADDRESS) + return it->second; + + lldb::addr_t load_addr = LLDB_INVALID_ADDRESS; + + // Second, try to get it through the process plugins. For a remote process, + // the remote platform will be responsible for providing it. + FileSpec file_spec(executable->GetPlatformFileSpec()); + bool is_loaded = false; + Status status = + m_process->GetFileLoadAddress(file_spec, is_loaded, load_addr); + // Servers other than lldb server could respond with a bogus address. 
+ if (status.Success() && is_loaded && load_addr != LLDB_INVALID_ADDRESS) { + m_loaded_modules[executable] = load_addr; + return load_addr; + } + + //// Hack to try set breakpoint + //Breakpoint *dyld_break = m_process->GetTarget().CreateBreakpoint(0x100000638, true, false).get(); + //dyld_break->SetCallback(DynamicLoaderAIXDYLD::NotifyBreakpointHit, this, true); + //dyld_break->SetBreakpointKind("hack-debug"); + + return LLDB_INVALID_ADDRESS; +} + +bool DynamicLoaderAIXDYLD::NotifyBreakpointHit( + void *baton, StoppointCallbackContext *context, lldb::user_id_t break_id, + lldb::user_id_t break_loc_id) { +} + +void DynamicLoaderAIXDYLD::DidAttach() { + Log *log = GetLog(LLDBLog::DynamicLoader); + LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); + + ModuleSP executable = GetTargetExecutable(); + + if (!executable.get()) + return; + + // Try to fetch the load address of the file from the process, since there + // could be randomization of the load address. + lldb::addr_t load_addr = GetLoadAddress(executable); + if (load_addr == LLDB_INVALID_ADDRESS) + return; + + // Request the process base address. + lldb::addr_t image_base = m_process->GetImageInfoAddress(); + if (image_base == load_addr) + return; + + // Rebase the process's modules if there is a mismatch. 
+ UpdateLoadedSections(executable, LLDB_INVALID_ADDRESS, load_addr, false); + + ModuleList module_list; + module_list.Append(executable); + m_process->GetTarget().ModulesDidLoad(module_list); + auto error = m_process->LoadModules(); + LLDB_LOG_ERROR(log, std::move(error), "failed to load modules: {0}"); + +#if defined(__AIX__) + // Get struct ld_xinfo (FIXME) + struct ld_xinfo ldinfo[64]; + Status status = m_process->GetLDXINFO(&(ldinfo[0])); + if (status.Fail()) { + Log *log = GetLog(LLDBLog::DynamicLoader); + LLDB_LOG(log, "LDXINFO failed: {0}", status); + return; + } + struct ld_xinfo *ptr = &(ldinfo[0]); + bool skip_current = true; + while (ptr != nullptr) { + char *pathName = (char *)ptr + ptr->ldinfo_filename; + char *memberName = pathName + (strlen(pathName) + 1); + if (!skip_current) { + // FIXME: buffer size + char pathWithMember[128] = {0}; + if (strlen(memberName) > 0) { + sprintf(pathWithMember, "%s(%s)", pathName, memberName); + } else { + sprintf(pathWithMember, "%s", pathName); + } + FileSpec file(pathWithMember); + ModuleSpec module_spec(file, m_process->GetTarget().GetArchitecture()); + if (ModuleSP module_sp = m_process->GetTarget().GetOrCreateModule(module_spec, true /* notify */)) { + UpdateLoadedSectionsByType(module_sp, LLDB_INVALID_ADDRESS, (lldb::addr_t)ptr->ldinfo_textorg, false, 1); + UpdateLoadedSectionsByType(module_sp, LLDB_INVALID_ADDRESS, (lldb::addr_t)ptr->ldinfo_dataorg, false, 2); + // FIXME: .tdata, .bss + } + } else { + skip_current = false; + } + if (ptr->ldinfo_next == 0) { + ptr = nullptr; + } else { + ptr = (struct ld_xinfo *)((char *)ptr + ptr->ldinfo_next); + } + } +#endif +} + +void DynamicLoaderAIXDYLD::DidLaunch() { + Log *log = GetLog(LLDBLog::DynamicLoader); + LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); + + ModuleSP executable = GetTargetExecutable(); + if (!executable.get()) + return; + + lldb::addr_t load_addr = GetLoadAddress(executable); + if (load_addr != LLDB_INVALID_ADDRESS) { + // Update the 
loaded sections so that the breakpoints can be resolved. + UpdateLoadedSections(executable, LLDB_INVALID_ADDRESS, load_addr, false); + + ModuleList module_list; + module_list.Append(executable); + m_process->GetTarget().ModulesDidLoad(module_list); + auto error = m_process->LoadModules(); + LLDB_LOG_ERROR(log, std::move(error), "failed to load modules: {0}"); + } + +#if defined(__AIX__) + // Get struct ld_xinfo (FIXME) + struct ld_xinfo ldinfo[64]; + Status status = m_process->GetLDXINFO(&(ldinfo[0])); + if (status.Fail()) { + Log *log = GetLog(LLDBLog::DynamicLoader); + LLDB_LOG(log, "LDXINFO failed: {0}", status); + return; + } + struct ld_xinfo *ptr = &(ldinfo[0]); + bool skip_current = true; + while (ptr != nullptr) { + char *pathName = (char *)ptr + ptr->ldinfo_filename; + char *memberName = pathName + (strlen(pathName) + 1); + if (!skip_current) { + // FIXME: buffer size + char pathWithMember[128] = {0}; + if (strlen(memberName) > 0) { + sprintf(pathWithMember, "%s(%s)", pathName, memberName); + } else { + sprintf(pathWithMember, "%s", pathName); + } + FileSpec file(pathWithMember); + ModuleSpec module_spec(file, m_process->GetTarget().GetArchitecture()); + if (ModuleSP module_sp = m_process->GetTarget().GetOrCreateModule(module_spec, true /* notify */)) { + UpdateLoadedSectionsByType(module_sp, LLDB_INVALID_ADDRESS, (lldb::addr_t)ptr->ldinfo_textorg, false, 1); + UpdateLoadedSectionsByType(module_sp, LLDB_INVALID_ADDRESS, (lldb::addr_t)ptr->ldinfo_dataorg, false, 2); + // FIXME: .tdata, .bss + } + } else { + skip_current = false; + } + if (ptr->ldinfo_next == 0) { + ptr = nullptr; + } else { + ptr = (struct ld_xinfo *)((char *)ptr + ptr->ldinfo_next); + } + } +#endif +} + +Status DynamicLoaderAIXDYLD::CanLoadImage() { return Status(); } + +ThreadPlanSP +DynamicLoaderAIXDYLD::GetStepThroughTrampolinePlan(Thread &thread, + bool stop) { + //FIXME + return ThreadPlanSP(); +} diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h 
b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h new file mode 100644 index 0000000000000..ae4b7aca66dcc --- /dev/null +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h @@ -0,0 +1,55 @@ +//===-- DynamicLoaderAIXDYLD.h ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_DYNAMICLOADER_AIX_DYLD_DYNAMICLOADERAIXDYLD_H +#define LLDB_SOURCE_PLUGINS_DYNAMICLOADER_AIX_DYLD_DYNAMICLOADERAIXDYLD_H + +#include "lldb/Target/DynamicLoader.h" +#include "lldb/lldb-forward.h" + +#include + +namespace lldb_private { + +class DynamicLoaderAIXDYLD : public DynamicLoader { +public: + DynamicLoaderAIXDYLD(Process *process); + + ~DynamicLoaderAIXDYLD() override; + + static void Initialize(); + static void Terminate(); + static llvm::StringRef GetPluginNameStatic() { return "windows-dyld"; } + static llvm::StringRef GetPluginDescriptionStatic(); + + static DynamicLoader *CreateInstance(Process *process, bool force); + + void OnLoadModule(lldb::ModuleSP module_sp, const ModuleSpec module_spec, + lldb::addr_t module_addr); + void OnUnloadModule(lldb::addr_t module_addr); + + void DidAttach() override; + void DidLaunch() override; + Status CanLoadImage() override; + lldb::ThreadPlanSP GetStepThroughTrampolinePlan(Thread &thread, + bool stop) override; + + llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } + + static bool NotifyBreakpointHit(void *baton, StoppointCallbackContext *context, lldb::user_id_t break_id, lldb::user_id_t break_loc_id); + +protected: + lldb::addr_t GetLoadAddress(lldb::ModuleSP executable); + +private: + std::map m_loaded_modules; +}; + +} // namespace lldb_private + +#endif // 
LLDB_SOURCE_PLUGINS_DYNAMICLOADER_AIX_DYLD_DYNAMICLOADERWAIXDYLD_H diff --git a/lldb/source/Plugins/DynamicLoader/CMakeLists.txt b/lldb/source/Plugins/DynamicLoader/CMakeLists.txt index 30607159acdc0..4f3fb693faae1 100644 --- a/lldb/source/Plugins/DynamicLoader/CMakeLists.txt +++ b/lldb/source/Plugins/DynamicLoader/CMakeLists.txt @@ -5,4 +5,5 @@ add_subdirectory(POSIX-DYLD) add_subdirectory(Static) add_subdirectory(Hexagon-DYLD) add_subdirectory(Windows-DYLD) +add_subdirectory(AIX-DYLD) add_subdirectory(wasm-DYLD) diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index 20e5652c65bf8..26abea0fdd24d 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -256,7 +256,7 @@ DynamicLoaderDarwinKernel::SearchForKernelWithDebugHints(Process *process) { if (process->ReadMemoryFromInferior (kernel_addresses_64[i], uval, 8, read_err) == 8) { DataExtractor data (&uval, 8, process->GetByteOrder(), process->GetAddressByteSize()); - offset_t offset = 0; + lldb::offset_t offset = 0; uint64_t addr = data.GetU64 (&offset); if (CheckForKernelImageAtAddress(addr, process).IsValid()) { return addr; @@ -270,7 +270,7 @@ DynamicLoaderDarwinKernel::SearchForKernelWithDebugHints(Process *process) { if (process->ReadMemoryFromInferior (kernel_addresses_32[i], uval, 4, read_err) == 4) { DataExtractor data (&uval, 4, process->GetByteOrder(), process->GetAddressByteSize()); - offset_t offset = 0; + lldb::offset_t offset = 0; uint32_t addr = data.GetU32 (&offset); if (CheckForKernelImageAtAddress(addr, process).IsValid()) { return addr; diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp index 3863b6b3520db..624848dee6ec3 100644 --- 
a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp @@ -1151,7 +1151,7 @@ DynamicLoaderDarwin::GetThreadLocalData(const lldb::ModuleSP module_sp, // TLS data for the pthread_key on a specific thread yet. If we have we // can re-use it since its location will not change unless the process // execs. - const tid_t tid = thread_sp->GetID(); + const lldb::tid_t tid = thread_sp->GetID(); auto tid_pos = m_tid_to_tls_map.find(tid); if (tid_pos != m_tid_to_tls_map.end()) { auto tls_pos = tid_pos->second.find(key); diff --git a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp index 3035c51341778..d14ae2daeb47d 100644 --- a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp +++ b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.cpp @@ -146,7 +146,25 @@ EmulateInstructionPPC64::GetOpcodeForInstruction(uint32_t opcode) { {0xfc000000, 0x38000000, &EmulateInstructionPPC64::EmulateADDI, "addi RT, RA, SI"}, {0xfc000003, 0xe8000000, &EmulateInstructionPPC64::EmulateLD, - "ld RT, DS(RA)"}}; + "ld RT, DS(RA)"}, +// {0xffff0003, 0x40820000, &EmulateInstructionPPC64::EmulateBNE, +// "bne TARGET"}, + {0xfc000002, 0x48000000, &EmulateInstructionPPC64::EmulateB, + "b TARGET"}, + {0xfc000003, 0x48000002, &EmulateInstructionPPC64::EmulateBA, + "ba TARGET"}, + {0xfc000003, 0x48000003, &EmulateInstructionPPC64::EmulateBLA, + "bla TARGET"}, + {0xfc000002, 0x40000000, &EmulateInstructionPPC64::EmulateBC, + "bc BO,BI,TARGET"}, + {0xfc000002, 0x40000002, &EmulateInstructionPPC64::EmulateBCA, + "bca BO,BI,TARGET"}, + {0xfc0007fe, 0x4c000020, &EmulateInstructionPPC64::EmulateBCLR, + "bclr BO,BI,BH"}, + {0xfc0007fe, 0x4c000420, &EmulateInstructionPPC64::EmulateBCCTR, + "bcctr BO,BI,BH"}, + {0xfc0007fe, 0x4c000460, &EmulateInstructionPPC64::EmulateBCTAR, + "bctar BO,BI,BH"}}; static const size_t 
k_num_ppc_opcodes = std::size(g_opcodes); for (size_t i = 0; i < k_num_ppc_opcodes; ++i) { @@ -169,12 +187,13 @@ bool EmulateInstructionPPC64::EvaluateInstruction(uint32_t evaluate_options) { bool success = false; - uint32_t orig_pc_value = 0; + uint64_t orig_pc_value = 0; if (auto_advance_pc) { orig_pc_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_pc_ppc64le, 0, &success); if (!success) return false; + LLDB_LOG(GetLog(LLDBLog::Unwind), "orig_pc_value:{0}", orig_pc_value); } // Call the Emulate... function. @@ -183,11 +202,13 @@ bool EmulateInstructionPPC64::EvaluateInstruction(uint32_t evaluate_options) { return false; if (auto_advance_pc) { - uint32_t new_pc_value = + uint64_t new_pc_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_pc_ppc64le, 0, &success); if (!success) return false; + LLDB_LOG(GetLog(LLDBLog::Unwind), "new_pc_value:{0}", new_pc_value); + if (new_pc_value == orig_pc_value) { EmulateInstruction::Context context; context.type = eContextAdvancePC; @@ -389,5 +410,174 @@ bool EmulateInstructionPPC64::EmulateADDI(uint32_t opcode) { return false; WriteRegisterUnsigned(ctx, eRegisterKindLLDB, gpr_r1_ppc64le, r1 + si_val); LLDB_LOG(log, "EmulateADDI: success!"); + + // FIX the next-pc + uint64_t pc_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_pc_ppc64le, 0, &success); + uint64_t next_pc = pc_value + 4; + ctx.type = eContextAdjustPC; + WriteRegisterUnsigned(ctx, eRegisterKindLLDB, gpr_pc_ppc64le, next_pc); + + return true; +} + +bool EmulateInstructionPPC64::EmulateBC(uint32_t opcode) { + // FIXME:32bit M + uint32_t M = 0; + uint32_t target32 = Bits32(opcode, 15, 2) << 2; + uint64_t target = (uint64_t)target32 + ((target32 & 0x8000) ? 
0xffffffffffff0000UL : 0); + uint32_t BO = Bits32(opcode, 25, 21); + bool success; + uint64_t ctr_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_ctr_ppc64le, 0, &success); + if ((~BO) & (1U << 2)) + ctr_value = ctr_value - 1; + bool ctr_ok = (bool)(BO & (1U << 2)) | ((bool)(ctr_value != 0) ^ (bool)(BO & (1U << 1))); + uint64_t cr_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_cr_ppc64le, 0, &success); + uint32_t BI = Bits32(opcode, 20, 16); + bool cond_ok = (bool)(BO & (1U << 4)) | (bool)(((cr_value >> (63 - (BI + 32))) & 1U) == ((BO >> 3) & 1U)); + + uint64_t pc_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_pc_ppc64le, 0, &success); + uint64_t next_pc = pc_value + 4; + if (ctr_ok & cond_ok) + next_pc = pc_value + target; + + Context ctx; + ctx.type = eContextAdjustPC; + WriteRegisterUnsigned(ctx, eRegisterKindLLDB, gpr_pc_ppc64le, next_pc); + Log *log = GetLog(LLDBLog::Unwind); + LLDB_LOG(log, "EmulateBC: success!"); + return true; +} + +bool EmulateInstructionPPC64::EmulateBCA(uint32_t opcode) { + // FIXME:32bit M + uint32_t M = 0; + uint32_t target32 = Bits32(opcode, 15, 2) << 2; + uint64_t target = (uint64_t)target32 + ((target32 & 0x8000) ? 
0xffffffffffff0000UL : 0); + uint32_t BO = Bits32(opcode, 25, 21); + bool success; + uint64_t ctr_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_ctr_ppc64le, 0, &success); + if ((~BO) & (1U << 2)) + ctr_value = ctr_value - 1; + bool ctr_ok = (bool)(BO & (1U << 2)) | ((bool)(ctr_value != 0) ^ (bool)(BO & (1U << 1))); + uint64_t cr_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_cr_ppc64le, 0, &success); + uint32_t BI = Bits32(opcode, 20, 16); + bool cond_ok = (bool)(BO & (1U << 4)) | (bool)(((cr_value >> (63 - (BI + 32))) & 1U) == ((BO >> 3) & 1U)); + + uint64_t pc_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_pc_ppc64le, 0, &success); + uint64_t next_pc = pc_value + 4; + if (ctr_ok & cond_ok) + next_pc = target; + + Context ctx; + ctx.type = eContextAdjustPC; + WriteRegisterUnsigned(ctx, eRegisterKindLLDB, gpr_pc_ppc64le, next_pc); + Log *log = GetLog(LLDBLog::Unwind); + LLDB_LOG(log, "EmulateBCA: success!"); + return true; +} + +bool EmulateInstructionPPC64::EmulateBCLR(uint32_t opcode) { + // FIXME:32bit M + uint32_t M = 0; + uint32_t BO = Bits32(opcode, 25, 21); + bool success; + uint64_t ctr_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_ctr_ppc64le, 0, &success); + if ((~BO) & (1U << 2)) + ctr_value = ctr_value - 1; + bool ctr_ok = (bool)(BO & (1U << 2)) | ((bool)(ctr_value != 0) ^ (bool)(BO & (1U << 1))); + uint64_t cr_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_cr_ppc64le, 0, &success); + uint32_t BI = Bits32(opcode, 20, 16); + bool cond_ok = (bool)(BO & (1U << 4)) | (bool)(((cr_value >> (63 - (BI + 32))) & 1U) == ((BO >> 3) & 1U)); + + uint64_t pc_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_pc_ppc64le, 0, &success); + uint64_t next_pc = pc_value + 4; + if (ctr_ok & cond_ok) { + next_pc = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_lr_ppc64le, 0, &success); + next_pc &= ~((1UL << 2) - 1); + } + + Context ctx; + ctx.type = eContextAdjustPC; + WriteRegisterUnsigned(ctx, eRegisterKindLLDB, gpr_pc_ppc64le, next_pc); + Log *log 
= GetLog(LLDBLog::Unwind); + LLDB_LOG(log, "EmulateBCLR: success!"); + return true; +} + +bool EmulateInstructionPPC64::EmulateBCCTR(uint32_t opcode) { + // FIXME:32bit M + uint32_t M = 0; + uint32_t BO = Bits32(opcode, 25, 21); + bool success; + uint64_t cr_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_cr_ppc64le, 0, &success); + uint32_t BI = Bits32(opcode, 20, 16); + bool cond_ok = (bool)(BO & (1U << 4)) | (bool)(((cr_value >> (63 - (BI + 32))) & 1U) == ((BO >> 3) & 1U)); + + Log *log = GetLog(LLDBLog::Unwind); + uint64_t pc_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_pc_ppc64le, 0, &success); + uint64_t next_pc = pc_value + 4; + if (cond_ok) { + next_pc = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_ctr_ppc64le, 0, &success); + next_pc &= ~((1UL << 2) - 1); + if (next_pc < 0x4000000) { + LLDB_LOGF(log, "EmulateBCCTR: next address %lx out of range, emulate by goto LR!"); + next_pc = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_lr_ppc64le, 0, &success); + } + } + + Context ctx; + ctx.type = eContextAdjustPC; + WriteRegisterUnsigned(ctx, eRegisterKindLLDB, gpr_pc_ppc64le, next_pc); + LLDB_LOG(log, "EmulateBCCTR: success!"); + return true; +} + +bool EmulateInstructionPPC64::EmulateBCTAR(uint32_t opcode) { + // Not supported yet. + LLDB_LOG(GetLog(LLDBLog::Unwind), "EmulateBCTAR: not supported!"); + assert(0); + return false; +} + +bool EmulateInstructionPPC64::EmulateB(uint32_t opcode) { + uint32_t target32 = Bits32(opcode, 25, 2) << 2; + uint64_t target = (uint64_t)target32 + ((target32 & 0x2000000) ? 
0xfffffffffc000000UL : 0); + + bool success; + uint64_t pc_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_pc_ppc64le, 0, &success); + uint64_t next_pc = pc_value + target; + + Context ctx; + ctx.type = eContextAdjustPC; + WriteRegisterUnsigned(ctx, eRegisterKindLLDB, gpr_pc_ppc64le, next_pc); + Log *log = GetLog(LLDBLog::Unwind); + LLDB_LOG(log, "EmulateB: success!"); + return true; +} + +bool EmulateInstructionPPC64::EmulateBA(uint32_t opcode) { + Log *log = GetLog(LLDBLog::Unwind); + + bool success; + uint64_t next_pc = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_lr_ppc64le, 0, &success); + + Context ctx; + ctx.type = eContextAdjustPC; + WriteRegisterUnsigned(ctx, eRegisterKindLLDB, gpr_pc_ppc64le, next_pc); + LLDB_LOG(log, "EmulateBA: emulate by branch to lr!"); + return true; +} + +bool EmulateInstructionPPC64::EmulateBLA(uint32_t opcode) { + Log *log = GetLog(LLDBLog::Unwind); + + bool success; + uint64_t pc_value = ReadRegisterUnsigned(eRegisterKindLLDB, gpr_pc_ppc64le, 0, &success); + uint64_t next_pc = pc_value + 4; + + Context ctx; + ctx.type = eContextAdjustPC; + WriteRegisterUnsigned(ctx, eRegisterKindLLDB, gpr_pc_ppc64le, next_pc); + LLDB_LOG(log, "EmulateBLA: emulate by branch to lr!"); return true; } diff --git a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h index a9424f16b0ad0..1576c9700e557 100644 --- a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h +++ b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h @@ -39,6 +39,12 @@ class EmulateInstructionPPC64 : public EmulateInstruction { return true; case eInstructionTypePCModifying: +#if defined(__AIX__) + return true; +#else + return false; +#endif + case eInstructionTypeAll: return false; } @@ -84,6 +90,14 @@ class EmulateInstructionPPC64 : public EmulateInstruction { bool EmulateSTD(uint32_t opcode); bool EmulateOR(uint32_t opcode); bool EmulateADDI(uint32_t opcode); + bool 
EmulateB(uint32_t opcode); + bool EmulateBA(uint32_t opcode); + bool EmulateBLA(uint32_t opcode); + bool EmulateBC(uint32_t opcode); + bool EmulateBCA(uint32_t opcode); + bool EmulateBCLR(uint32_t opcode); + bool EmulateBCCTR(uint32_t opcode); + bool EmulateBCTAR(uint32_t opcode); }; } // namespace lldb_private diff --git a/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp b/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp index b7cd2b1ac6bf6..876e74056face 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/MainThreadChecker/InstrumentationRuntimeMainThreadChecker.cpp @@ -261,7 +261,7 @@ InstrumentationRuntimeMainThreadChecker::GetBacktracesFromExtendedStopInfo( StructuredData::ObjectSP thread_id_obj = info->GetObjectForDotSeparatedPath("tid"); - tid_t tid = thread_id_obj ? thread_id_obj->GetUnsignedIntegerValue() : 0; + lldb::tid_t tid = thread_id_obj ? thread_id_obj->GetUnsignedIntegerValue() : 0; // We gather symbolication addresses above, so no need for HistoryThread to // try to infer the call addresses. 
diff --git a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp index b2781aa5e7db1..7a827a3ea76f9 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/TSan/InstrumentationRuntimeTSan.cpp @@ -770,13 +770,13 @@ std::string InstrumentationRuntimeTSan::GetLocationDescription( Sprintf("Location is a %ld-byte heap object at 0x%llx", size, addr); } } else if (type == "stack") { - tid_t tid = loc->GetAsDictionary() + lldb::tid_t tid = loc->GetAsDictionary() ->GetValueForKey("thread_id") ->GetUnsignedIntegerValue(); result = Sprintf("Location is stack of thread %d", tid); } else if (type == "tls") { - tid_t tid = loc->GetAsDictionary() + lldb::tid_t tid = loc->GetAsDictionary() ->GetValueForKey("thread_id") ->GetUnsignedIntegerValue(); @@ -948,7 +948,7 @@ static std::string GenerateThreadName(const std::string &path, if (path == "mops") { size_t size = o->GetObjectForDotSeparatedPath("size")->GetUnsignedIntegerValue(); - tid_t thread_id = + lldb::tid_t thread_id = o->GetObjectForDotSeparatedPath("thread_id")->GetUnsignedIntegerValue(); bool is_write = o->GetObjectForDotSeparatedPath("is_write")->GetBooleanValue(); @@ -979,7 +979,7 @@ static std::string GenerateThreadName(const std::string &path, } if (path == "threads") { - tid_t thread_id = + lldb::tid_t thread_id = o->GetObjectForDotSeparatedPath("thread_id")->GetUnsignedIntegerValue(); result = Sprintf("Thread %zu created", thread_id); } @@ -987,7 +987,7 @@ static std::string GenerateThreadName(const std::string &path, if (path == "locs") { std::string type = std::string( o->GetAsDictionary()->GetValueForKey("type")->GetStringValue()); - tid_t thread_id = + lldb::tid_t thread_id = o->GetObjectForDotSeparatedPath("thread_id")->GetUnsignedIntegerValue(); int fd = o->GetObjectForDotSeparatedPath("file_descriptor") 
->GetSignedIntegerValue(); @@ -1007,7 +1007,7 @@ static std::string GenerateThreadName(const std::string &path, } if (path == "stacks") { - tid_t thread_id = + lldb::tid_t thread_id = o->GetObjectForDotSeparatedPath("thread_id")->GetUnsignedIntegerValue(); result = Sprintf("Thread %" PRIu64, thread_id); } @@ -1034,7 +1034,7 @@ static void AddThreadsForPath(const std::string &path, StructuredData::ObjectSP thread_id_obj = o->GetObjectForDotSeparatedPath("thread_os_id"); - tid_t tid = + lldb::tid_t tid = thread_id_obj ? thread_id_obj->GetUnsignedIntegerValue() : 0; ThreadSP new_thread_sp = diff --git a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp index 1c58922e8d36c..de9719ad4a89e 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp @@ -321,7 +321,7 @@ InstrumentationRuntimeUBSan::GetBacktracesFromExtendedStopInfo( StructuredData::ObjectSP thread_id_obj = info->GetObjectForDotSeparatedPath("tid"); - tid_t tid = thread_id_obj ? thread_id_obj->GetUnsignedIntegerValue() : 0; + lldb::tid_t tid = thread_id_obj ? thread_id_obj->GetUnsignedIntegerValue() : 0; // We gather symbolication addresses above, so no need for HistoryThread to // try to infer the call addresses. 
diff --git a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp index 1688fb27430a7..690fb0d60a09a 100644 --- a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp +++ b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp @@ -194,6 +194,10 @@ void JITLoaderGDB::SetJITBreakpoint(lldb_private::ModuleList &module_list) { if (jit_addr == LLDB_INVALID_ADDRESS) return; +#if defined(__AIX__) + return; +#endif + m_jit_descriptor_addr = GetSymbolAddress( module_list, ConstString("__jit_debug_descriptor"), eSymbolTypeData); if (m_jit_descriptor_addr == LLDB_INVALID_ADDRESS) { diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp index 341923108e321..fb5bc2c58e6fb 100644 --- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp +++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp @@ -1227,6 +1227,7 @@ bool lldb_private::formatters::ObjCSELSummaryProvider( time_t lldb_private::formatters::GetOSXEpoch() { static time_t epoch = 0; if (!epoch) { +#if !defined(__AIX__) #ifndef _WIN32 tzset(); tm tm_epoch; @@ -1240,6 +1241,7 @@ time_t lldb_private::formatters::GetOSXEpoch() { tm_epoch.tm_gmtoff = 0; tm_epoch.tm_zone = nullptr; epoch = timegm(&tm_epoch); +#endif #endif } return epoch; diff --git a/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp b/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp index 6efd2516578ff..fe6c5a0544be3 100644 --- a/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp +++ b/lldb/source/Plugins/MemoryHistory/asan/MemoryHistoryASan.cpp @@ -107,7 +107,7 @@ static void CreateHistoryThreadFromValueObject(ProcessSP process_sp, return; int count = count_sp->GetValueAsUnsigned(0); - tid_t tid = tid_sp->GetValueAsUnsigned(0) + 1; + lldb::tid_t tid = tid_sp->GetValueAsUnsigned(0) + 1; if (count <= 0) return; diff --git a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp 
b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp index 7aa5b8d81890a..5ea55772c3aba 100644 --- a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp +++ b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp @@ -8,7 +8,7 @@ #include "ObjectContainerBSDArchive.h" -#if defined(_WIN32) || defined(__ANDROID__) +#if defined(_WIN32) || defined(__ANDROID__) || defined(__AIX__) // Defines from ar, missing on Windows #define SARMAG 8 #define ARFMAG "`\n" diff --git a/lldb/source/Plugins/ObjectContainer/Big-Archive/CMakeLists.txt b/lldb/source/Plugins/ObjectContainer/Big-Archive/CMakeLists.txt new file mode 100644 index 0000000000000..612a36265b536 --- /dev/null +++ b/lldb/source/Plugins/ObjectContainer/Big-Archive/CMakeLists.txt @@ -0,0 +1,10 @@ +add_lldb_library(lldbPluginObjectContainerBigArchive PLUGIN + ObjectContainerBigArchive.cpp + + LINK_LIBS + lldbCore + lldbHost + lldbSymbol + LINK_COMPONENTS + Support + ) diff --git a/lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.cpp b/lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.cpp new file mode 100644 index 0000000000000..050ad73f1d19a --- /dev/null +++ b/lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.cpp @@ -0,0 +1,522 @@ +//===-- ObjectContainerBigArchive.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ObjectContainerBigArchive.h" + +#if defined(_WIN32) || defined(__ANDROID__) || defined(__AIX__) +// Defines from ar, missing on Windows +#define ARMAG "!\n" +#define SARMAG 8 +#define ARFMAG "`\n" + +typedef struct ar_hdr { + char ar_name[16]; + char ar_date[12]; + char ar_uid[6], ar_gid[6]; + char ar_mode[8]; + char ar_size[10]; + char ar_fmag[2]; +} ar_hdr; +#else +#include +#endif + +#include "lldb/Core/Module.h" +#include "lldb/Core/ModuleSpec.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Symbol/ObjectFile.h" +#include "lldb/Utility/ArchSpec.h" +#include "lldb/Utility/Stream.h" +#include "lldb/Utility/Timer.h" + +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Object/Archive.h" +#include "llvm/Support/Chrono.h" + +using namespace lldb; +using namespace lldb_private; + +LLDB_PLUGIN_DEFINE(ObjectContainerBigArchive) + +ObjectContainerBigArchive::Object::Object() : ar_name() {} + +void ObjectContainerBigArchive::Object::Clear() { + ar_name.Clear(); + modification_time = 0; + uid = 0; + gid = 0; + mode = 0; + size = 0; + file_offset = 0; + file_size = 0; +} + +lldb::offset_t +ObjectContainerBigArchive::Object::Extract(const DataExtractor &data, + lldb::offset_t offset) { + size_t ar_name_len = 0; + std::string str; + char *err; + + // File header + // + // The common format is as follows. 
+ // + // Offset Length Name Format + // 0 16 File name ASCII right padded with spaces (no spaces + // allowed in file name) + // 16 12 File mod Decimal as cstring right padded with + // spaces + // 28 6 Owner ID Decimal as cstring right padded with + // spaces + // 34 6 Group ID Decimal as cstring right padded with + // spaces + // 40 8 File mode Octal as cstring right padded with + // spaces + // 48 10 File byte size Decimal as cstring right padded with + // spaces + // 58 2 File magic 0x60 0x0A + + // Make sure there is enough data for the file header and bail if not + if (!data.ValidOffsetForDataOfSize(offset, 60)) + return LLDB_INVALID_OFFSET; + + str.assign((const char *)data.GetData(&offset, 16), 16); + if (llvm::StringRef(str).starts_with("#1/")) { + // If the name is longer than 16 bytes, or contains an embedded space then + // it will use this format where the length of the name is here and the + // name characters are after this header. + ar_name_len = strtoul(str.c_str() + 3, &err, 10); + } else { + // Strip off any trailing spaces. 
+ const size_t last_pos = str.find_last_not_of(' '); + if (last_pos != std::string::npos) { + if (last_pos + 1 < 16) + str.erase(last_pos + 1); + } + ar_name.SetCString(str.c_str()); + } + + str.assign((const char *)data.GetData(&offset, 12), 12); + modification_time = strtoul(str.c_str(), &err, 10); + + str.assign((const char *)data.GetData(&offset, 6), 6); + uid = strtoul(str.c_str(), &err, 10); + + str.assign((const char *)data.GetData(&offset, 6), 6); + gid = strtoul(str.c_str(), &err, 10); + + str.assign((const char *)data.GetData(&offset, 8), 8); + mode = strtoul(str.c_str(), &err, 8); + + str.assign((const char *)data.GetData(&offset, 10), 10); + size = strtoul(str.c_str(), &err, 10); + + str.assign((const char *)data.GetData(&offset, 2), 2); + if (str == ARFMAG) { + if (ar_name_len > 0) { + const void *ar_name_ptr = data.GetData(&offset, ar_name_len); + // Make sure there was enough data for the string value and bail if not + if (ar_name_ptr == nullptr) + return LLDB_INVALID_OFFSET; + str.assign((const char *)ar_name_ptr, ar_name_len); + ar_name.SetCString(str.c_str()); + } + file_offset = offset; + file_size = size - ar_name_len; + return offset; + } + return LLDB_INVALID_OFFSET; +} + +ObjectContainerBigArchive::Archive::Archive(const lldb_private::ArchSpec &arch, + const llvm::sys::TimePoint<> &time, + lldb::offset_t file_offset, + lldb_private::DataExtractor &data) + : m_arch(arch), m_modification_time(time), m_file_offset(file_offset), + m_objects(), m_data(data) {} + +ObjectContainerBigArchive::Archive::~Archive() = default; + +size_t ObjectContainerBigArchive::Archive::ParseObjects() { + DataExtractor &data = m_data; + std::string str; + lldb::offset_t offset = 0; + str.assign((const char *)data.GetData(&offset, (sizeof(llvm::object::BigArchiveMagic) - 1)), + (sizeof(llvm::object::BigArchiveMagic) - 1)); + if (str == llvm::object::BigArchiveMagic) { + llvm::Error err = llvm::Error::success(); + llvm::object::BigArchive 
bigAr(llvm::MemoryBufferRef(toStringRef(m_data.GetData()), llvm::StringRef("")), err); + if (err) + return 0; + + for (const llvm::object::Archive::Child &child : bigAr.children(err)) { + if (err) + continue; + if (!child.getParent()) + continue; + Object obj; + obj.Clear(); + // FIXME: check errors + llvm::Expected childNameOrErr = child.getName(); + if (!childNameOrErr) + continue; + obj.ar_name.SetCString(childNameOrErr->str().c_str()); + llvm::Expected> lastModifiedOrErr = child.getLastModified(); + if (!lastModifiedOrErr) + continue; + obj.modification_time = (uint32_t)llvm::sys::toTimeT(*(lastModifiedOrErr)); + llvm::Expected getUIDOrErr = child.getUID(); + if (!getUIDOrErr) + continue; + obj.uid = (uint16_t)*getUIDOrErr; + llvm::Expected getGIDOrErr = child.getGID(); + if (!getGIDOrErr) + continue; + obj.gid = (uint16_t)*getGIDOrErr; + llvm::Expected getAccessModeOrErr = child.getAccessMode(); + if (!getAccessModeOrErr) + continue; + obj.mode = (uint16_t)*getAccessModeOrErr; + llvm::Expected getRawSizeOrErr = child.getRawSize(); + if (!getRawSizeOrErr) + continue; + obj.size = (uint32_t)*getRawSizeOrErr; + + obj.file_offset = (lldb::offset_t)child.getDataOffset(); + + llvm::Expected getSizeOrErr = child.getSize(); + if (!getSizeOrErr) + continue; + obj.file_size = (lldb::offset_t)*getSizeOrErr; + + size_t obj_idx = m_objects.size(); + m_objects.push_back(obj); + // Insert all of the C strings out of order for now... 
+ m_object_name_to_index_map.Append(obj.ar_name, obj_idx); + } + if (err) + return 0; + + // Now sort all of the object name pointers + m_object_name_to_index_map.Sort(); + } + return m_objects.size(); +} + +ObjectContainerBigArchive::Object * +ObjectContainerBigArchive::Archive::FindObject( + ConstString object_name, const llvm::sys::TimePoint<> &object_mod_time) { + const ObjectNameToIndexMap::Entry *match = + m_object_name_to_index_map.FindFirstValueForName(object_name); + if (!match) + return nullptr; + if (object_mod_time == llvm::sys::TimePoint<>()) + return &m_objects[match->value]; + + const uint64_t object_modification_date = llvm::sys::toTimeT(object_mod_time); + if (m_objects[match->value].modification_time == object_modification_date) + return &m_objects[match->value]; + + const ObjectNameToIndexMap::Entry *next_match = + m_object_name_to_index_map.FindNextValueForName(match); + while (next_match) { + if (m_objects[next_match->value].modification_time == + object_modification_date) + return &m_objects[next_match->value]; + next_match = m_object_name_to_index_map.FindNextValueForName(next_match); + } + + return nullptr; +} + +ObjectContainerBigArchive::Archive::shared_ptr +ObjectContainerBigArchive::Archive::FindCachedArchive( + const FileSpec &file, const ArchSpec &arch, + const llvm::sys::TimePoint<> &time, lldb::offset_t file_offset) { + std::lock_guard guard(Archive::GetArchiveCacheMutex()); + shared_ptr archive_sp; + Archive::Map &archive_map = Archive::GetArchiveCache(); + Archive::Map::iterator pos = archive_map.find(file); + // Don't cache a value for "archive_map.end()" below since we might delete an + // archive entry... 
+ while (pos != archive_map.end() && pos->first == file) { + bool match = true; + if (arch.IsValid() && + !pos->second->GetArchitecture().IsCompatibleMatch(arch)) + match = false; + else if (file_offset != LLDB_INVALID_OFFSET && + pos->second->GetFileOffset() != file_offset) + match = false; + if (match) { + if (pos->second->GetModificationTime() == time) { + return pos->second; + } else { + // We have a file at the same path with the same architecture whose + // modification time doesn't match. It doesn't make sense for us to + // continue to use this Big archive since we cache only the object info + // which consists of file time info and also the file offset and file + // size of any contained objects. Since this information is now out of + // date, we won't get the correct information if we go and extract the + // file data, so we should remove the old and outdated entry. + archive_map.erase(pos); + pos = archive_map.find(file); + continue; // Continue to next iteration so we don't increment pos + // below... 
+ } + } + ++pos; + } + return archive_sp; +} + +ObjectContainerBigArchive::Archive::shared_ptr +ObjectContainerBigArchive::Archive::ParseAndCacheArchiveForFile( + const FileSpec &file, const ArchSpec &arch, + const llvm::sys::TimePoint<> &time, lldb::offset_t file_offset, + DataExtractor &data) { + shared_ptr archive_sp(new Archive(arch, time, file_offset, data)); + if (archive_sp) { + const size_t num_objects = archive_sp->ParseObjects(); + if (num_objects > 0) { + std::lock_guard guard( + Archive::GetArchiveCacheMutex()); + Archive::GetArchiveCache().insert(std::make_pair(file, archive_sp)); + } else { + archive_sp.reset(); + } + } + return archive_sp; +} + +ObjectContainerBigArchive::Archive::Map & +ObjectContainerBigArchive::Archive::GetArchiveCache() { + static Archive::Map g_archive_map; + return g_archive_map; +} + +std::recursive_mutex & +ObjectContainerBigArchive::Archive::GetArchiveCacheMutex() { + static std::recursive_mutex g_archive_map_mutex; + return g_archive_map_mutex; +} + +void ObjectContainerBigArchive::Initialize() { + PluginManager::RegisterPlugin(GetPluginNameStatic(), + GetPluginDescriptionStatic(), CreateInstance, + GetModuleSpecifications); +} + +void ObjectContainerBigArchive::Terminate() { + PluginManager::UnregisterPlugin(CreateInstance); +} + +ObjectContainer *ObjectContainerBigArchive::CreateInstance( + const lldb::ModuleSP &module_sp, DataBufferSP &data_sp, + lldb::offset_t data_offset, const FileSpec *file, + lldb::offset_t file_offset, lldb::offset_t length) { + ConstString object_name(module_sp->GetObjectName()); + if (!object_name) + return nullptr; + + if (data_sp) { + // We have data, which means this is the first 512 bytes of the file Check + // to see if the magic bytes match and if they do, read the entire table of + // contents for the archive and cache it + DataExtractor data; + data.SetData(data_sp, data_offset, length); + if (file && data_sp && ObjectContainerBigArchive::MagicBytesMatch(data)) { + LLDB_SCOPED_TIMERF( + 
"ObjectContainerBigArchive::CreateInstance (module = %s, file = " + "%p, file_offset = 0x%8.8" PRIx64 ", file_size = 0x%8.8" PRIx64 ")", + module_sp->GetFileSpec().GetPath().c_str(), + static_cast(file), static_cast(file_offset), + static_cast(length)); + + // Map the entire .a file to be sure that we don't lose any data if the + // file gets updated by a new build while this .a file is being used for + // debugging + DataBufferSP archive_data_sp = + FileSystem::Instance().CreateDataBuffer(*file, length, file_offset); + if (!archive_data_sp) + return nullptr; + + lldb::offset_t archive_data_offset = 0; + + Archive::shared_ptr archive_sp(Archive::FindCachedArchive( + *file, module_sp->GetArchitecture(), module_sp->GetModificationTime(), + file_offset)); + std::unique_ptr container_up( + new ObjectContainerBigArchive(module_sp, archive_data_sp, + archive_data_offset, file, file_offset, + length)); + + if (container_up) { + if (archive_sp) { + // We already have this archive in our cache, use it + container_up->SetArchive(archive_sp); + return container_up.release(); + } else if (container_up->ParseHeader()) + return container_up.release(); + } + } + } else { + // No data, just check for a cached archive + Archive::shared_ptr archive_sp(Archive::FindCachedArchive( + *file, module_sp->GetArchitecture(), module_sp->GetModificationTime(), + file_offset)); + if (archive_sp) { + std::unique_ptr container_up( + new ObjectContainerBigArchive(module_sp, data_sp, data_offset, file, + file_offset, length)); + + if (container_up) { + // We already have this archive in our cache, use it + container_up->SetArchive(archive_sp); + return container_up.release(); + } + } + } + return nullptr; +} + +bool ObjectContainerBigArchive::MagicBytesMatch(const DataExtractor &data) { + uint32_t offset = 0; + const char *armag = (const char *)data.PeekData(offset, (sizeof(llvm::object::BigArchiveMagic) - 1)); + if (armag && ::strncmp(armag, llvm::object::BigArchiveMagic, 
(sizeof(llvm::object::BigArchiveMagic) - 1)) == 0) + return true; + return false; +} + +ObjectContainerBigArchive::ObjectContainerBigArchive( + const lldb::ModuleSP &module_sp, DataBufferSP &data_sp, + lldb::offset_t data_offset, const lldb_private::FileSpec *file, + lldb::offset_t file_offset, lldb::offset_t size) + : ObjectContainer(module_sp, file, file_offset, size, data_sp, data_offset), + m_archive_sp() {} +void ObjectContainerBigArchive::SetArchive(Archive::shared_ptr &archive_sp) { + m_archive_sp = archive_sp; +} + +ObjectContainerBigArchive::~ObjectContainerBigArchive() = default; + +bool ObjectContainerBigArchive::ParseHeader() { + if (m_archive_sp.get() == nullptr) { + if (m_data.GetByteSize() > 0) { + ModuleSP module_sp(GetModule()); + if (module_sp) { + m_archive_sp = Archive::ParseAndCacheArchiveForFile( + m_file, module_sp->GetArchitecture(), + module_sp->GetModificationTime(), m_offset, m_data); + } + // Clear the m_data that contains the entire archive data and let our + // m_archive_sp hold onto the data. 
+ m_data.Clear(); + } + } + return m_archive_sp.get() != nullptr; +} + +void ObjectContainerBigArchive::Object::Dump(Stream *s) const { + printf("name = \"%s\"\n", ar_name.GetCString()); + printf("mtime = 0x%8.8" PRIx32 "\n", modification_time); + printf("size = 0x%8.8" PRIx32 " (%" PRIu32 ")\n", size, size); + printf("file_offset = 0x%16.16" PRIx64 " (%" PRIu64 ")\n", file_offset, + file_offset); + printf("file_size = 0x%16.16" PRIx64 " (%" PRIu64 ")\n\n", file_size, + file_size); +} + +ObjectFileSP ObjectContainerBigArchive::GetObjectFile(const FileSpec *file) { + ModuleSP module_sp(GetModule()); + if (module_sp) { + if (module_sp->GetObjectName() && m_archive_sp) { + Object *object = m_archive_sp->FindObject( + module_sp->GetObjectName(), module_sp->GetObjectModificationTime()); + if (object) { + lldb::offset_t data_offset = object->file_offset; + return ObjectFile::FindPlugin( + module_sp, file, m_offset + object->file_offset, object->file_size, + m_archive_sp->GetData().GetSharedDataBuffer(), data_offset); + } + } + } + return ObjectFileSP(); +} + +size_t ObjectContainerBigArchive::GetModuleSpecifications( + const lldb_private::FileSpec &file, lldb::DataBufferSP &data_sp, + lldb::offset_t data_offset, lldb::offset_t file_offset, + lldb::offset_t file_size, lldb_private::ModuleSpecList &specs) { + + // We have data, which means this is the first 512 bytes of the file Check to + // see if the magic bytes match and if they do, read the entire table of + // contents for the archive and cache it + DataExtractor data; + data.SetData(data_sp, data_offset, data_sp->GetByteSize()); + if (!file || !data_sp || !ObjectContainerBigArchive::MagicBytesMatch(data)) + return 0; + + const size_t initial_count = specs.GetSize(); + llvm::sys::TimePoint<> file_mod_time = FileSystem::Instance().GetModificationTime(file); + Archive::shared_ptr archive_sp( + Archive::FindCachedArchive(file, ArchSpec(), file_mod_time, file_offset)); + bool set_archive_arch = false; + if (!archive_sp) 
{ + set_archive_arch = true; + data_sp = + FileSystem::Instance().CreateDataBuffer(file, file_size, file_offset); + if (data_sp) { + data.SetData(data_sp, 0, data_sp->GetByteSize()); + archive_sp = Archive::ParseAndCacheArchiveForFile( + file, ArchSpec(), file_mod_time, file_offset, data); + } + } + + if (archive_sp) { + const size_t num_objects = archive_sp->GetNumObjects(); + for (size_t idx = 0; idx < num_objects; ++idx) { + const Object *object = archive_sp->GetObjectAtIndex(idx); + if (object) { + const lldb::offset_t object_file_offset = + file_offset + object->file_offset; + if (object->file_offset < file_size && file_size > object_file_offset) { + if (ObjectFile::GetModuleSpecifications( + file, object_file_offset, file_size - object_file_offset, + specs)) { + ModuleSpec &spec = + specs.GetModuleSpecRefAtIndex(specs.GetSize() - 1); + llvm::sys::TimePoint<> object_mod_time( + std::chrono::seconds(object->modification_time)); + spec.GetObjectName() = object->ar_name; + spec.SetObjectOffset(object_file_offset); + spec.SetObjectSize(file_size - object_file_offset); + spec.GetObjectModificationTime() = object_mod_time; + } + } + } + } + } + const size_t end_count = specs.GetSize(); + size_t num_specs_added = end_count - initial_count; + if (set_archive_arch && num_specs_added > 0) { + // The archive was created but we didn't have an architecture so we need to + // set it + for (size_t i = initial_count; i < end_count; ++i) { + ModuleSpec module_spec; + if (specs.GetModuleSpecAtIndex(i, module_spec)) { + if (module_spec.GetArchitecture().IsValid()) { + archive_sp->SetArchitecture(module_spec.GetArchitecture()); + break; + } + } + } + } + return num_specs_added; +} diff --git a/lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.h b/lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.h new file mode 100644 index 0000000000000..ad9b814048a87 --- /dev/null +++ 
b/lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.h @@ -0,0 +1,177 @@ +//===-- ObjectContainerBigArchive.h -----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_OBJECTCONTAINER_BIG_ARCHIVE_OBJECTCONTAINERBIGARCHIVE_H +#define LLDB_SOURCE_PLUGINS_OBJECTCONTAINER_BIG_ARCHIVE_OBJECTCONTAINERBIGARCHIVE_H + +#include "lldb/Core/UniqueCStringMap.h" +#include "lldb/Symbol/ObjectContainer.h" +#include "lldb/Utility/ArchSpec.h" +#include "lldb/Utility/ConstString.h" +#include "lldb/Utility/FileSpec.h" + +#include "llvm/Support/Chrono.h" + +#include +#include +#include + +class ObjectContainerBigArchive : public lldb_private::ObjectContainer { +public: + ObjectContainerBigArchive(const lldb::ModuleSP &module_sp, + lldb::DataBufferSP &data_sp, + lldb::offset_t data_offset, + const lldb_private::FileSpec *file, + lldb::offset_t offset, lldb::offset_t length); + + ~ObjectContainerBigArchive() override; + + // Static Functions + static void Initialize(); + + static void Terminate(); + + static llvm::StringRef GetPluginNameStatic() { return "big-archive"; } + + static llvm::StringRef GetPluginDescriptionStatic() { + return "Big Archive object container reader."; + } + + static lldb_private::ObjectContainer * + CreateInstance(const lldb::ModuleSP &module_sp, lldb::DataBufferSP &data_sp, + lldb::offset_t data_offset, const lldb_private::FileSpec *file, + lldb::offset_t offset, lldb::offset_t length); + + static size_t GetModuleSpecifications(const lldb_private::FileSpec &file, + lldb::DataBufferSP &data_sp, + lldb::offset_t data_offset, + lldb::offset_t file_offset, + lldb::offset_t length, + lldb_private::ModuleSpecList &specs); + + static bool 
MagicBytesMatch(const lldb_private::DataExtractor &data); + + // Member Functions + bool ParseHeader() override; + + size_t GetNumObjects() const override { + if (m_archive_sp) + return m_archive_sp->GetNumObjects(); + return 0; + } + + lldb::ObjectFileSP GetObjectFile(const lldb_private::FileSpec *file) override; + + // PluginInterface protocol + llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } + +protected: + struct Object { + Object(); + + void Clear(); + + lldb::offset_t Extract(const lldb_private::DataExtractor &data, + lldb::offset_t offset); + /// Object name in the archive. + lldb_private::ConstString ar_name; + + /// Object modification time in the archive. + uint32_t modification_time = 0; + + /// Object user id in the archive. + uint16_t uid = 0; + + /// Object group id in the archive. + uint16_t gid = 0; + + /// Object octal file permissions in the archive. + uint16_t mode = 0; + + /// Object size in bytes in the archive. + uint32_t size = 0; + + /// File offset in bytes from the beginning of the file of the object data. + lldb::offset_t file_offset = 0; + + /// Length of the object data. 
+ lldb::offset_t file_size = 0; + + void Dump(lldb_private::Stream *s) const; + }; + + class Archive { + public: + typedef std::shared_ptr shared_ptr; + typedef std::multimap Map; + + Archive(const lldb_private::ArchSpec &arch, + const llvm::sys::TimePoint<> &mod_time, lldb::offset_t file_offset, + lldb_private::DataExtractor &data); + + ~Archive(); + + static Map &GetArchiveCache(); + + static std::recursive_mutex &GetArchiveCacheMutex(); + + static Archive::shared_ptr FindCachedArchive( + const lldb_private::FileSpec &file, const lldb_private::ArchSpec &arch, + const llvm::sys::TimePoint<> &mod_time, lldb::offset_t file_offset); + + static Archive::shared_ptr ParseAndCacheArchiveForFile( + const lldb_private::FileSpec &file, const lldb_private::ArchSpec &arch, + const llvm::sys::TimePoint<> &mod_time, lldb::offset_t file_offset, + lldb_private::DataExtractor &data); + + size_t GetNumObjects() const { return m_objects.size(); } + + const Object *GetObjectAtIndex(size_t idx) { + if (idx < m_objects.size()) + return &m_objects[idx]; + return nullptr; + } + + size_t ParseObjects(); + + Object *FindObject(lldb_private::ConstString object_name, + const llvm::sys::TimePoint<> &object_mod_time); + + lldb::offset_t GetFileOffset() const { return m_file_offset; } + + const llvm::sys::TimePoint<> &GetModificationTime() { + return m_modification_time; + } + + const lldb_private::ArchSpec &GetArchitecture() const { return m_arch; } + + void SetArchitecture(const lldb_private::ArchSpec &arch) { m_arch = arch; } + + bool HasNoExternalReferences() const; + + lldb_private::DataExtractor &GetData() { return m_data; } + + protected: + typedef lldb_private::UniqueCStringMap ObjectNameToIndexMap; + // Member Variables + lldb_private::ArchSpec m_arch; + llvm::sys::TimePoint<> m_modification_time; + lldb::offset_t m_file_offset; + std::vector m_objects; + ObjectNameToIndexMap m_object_name_to_index_map; + lldb_private::DataExtractor m_data; ///< The data for this object container + 
///so we don't lose data if the .a files + ///gets modified + }; + + void SetArchive(Archive::shared_ptr &archive_sp); + + Archive::shared_ptr m_archive_sp; +}; + +#endif // LLDB_SOURCE_PLUGINS_OBJECTCONTAINER_BIG_ARCHIVE_OBJECTCONTAINERBIGARCHIVE_H diff --git a/lldb/source/Plugins/ObjectContainer/CMakeLists.txt b/lldb/source/Plugins/ObjectContainer/CMakeLists.txt index cda0c8151dd8a..2492798bb13ef 100644 --- a/lldb/source/Plugins/ObjectContainer/CMakeLists.txt +++ b/lldb/source/Plugins/ObjectContainer/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(BSD-Archive) +add_subdirectory(Big-Archive) add_subdirectory(Universal-Mach-O) add_subdirectory(Mach-O-Fileset) diff --git a/lldb/source/Plugins/ObjectFile/CMakeLists.txt b/lldb/source/Plugins/ObjectFile/CMakeLists.txt index 773241c8944c8..7abd0c96f4fd7 100644 --- a/lldb/source/Plugins/ObjectFile/CMakeLists.txt +++ b/lldb/source/Plugins/ObjectFile/CMakeLists.txt @@ -6,5 +6,6 @@ add_subdirectory(Mach-O) add_subdirectory(Minidump) add_subdirectory(PDB) add_subdirectory(PECOFF) +add_subdirectory(XCOFF) add_subdirectory(Placeholder) add_subdirectory(wasm) diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index ce095bcc48374..bcb6330cbb1f9 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -5673,7 +5673,7 @@ bool ObjectFileMachO::GetCorefileMainBinaryInfo(addr_t &value, return false; } -bool ObjectFileMachO::GetCorefileThreadExtraInfos(std::vector &tids) { +bool ObjectFileMachO::GetCorefileThreadExtraInfos(std::vector &tids) { tids.clear(); ModuleSP module_sp(GetModule()); if (module_sp) { @@ -5724,8 +5724,8 @@ bool ObjectFileMachO::GetCorefileThreadExtraInfos(std::vector &tids) { return false; } StructuredData::Dictionary *thread = *maybe_thread; - tid_t tid = LLDB_INVALID_THREAD_ID; - if (thread->GetValueForKeyAsInteger("thread_id", tid)) + lldb::tid_t tid = 
LLDB_INVALID_THREAD_ID; + if (thread->GetValueForKeyAsInteger("thread_id", tid)) if (tid == 0) tid = LLDB_INVALID_THREAD_ID; tids.push_back(tid); diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp index faa144bfb5f6a..d27cdfc60de85 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp @@ -51,7 +51,9 @@ size_t ObjectFileMinidump::GetModuleSpecifications( const lldb_private::FileSpec &file, lldb::DataBufferSP &data_sp, lldb::offset_t data_offset, lldb::offset_t file_offset, lldb::offset_t length, lldb_private::ModuleSpecList &specs) { +#if !defined(__AIX__) specs.Clear(); +#endif return 0; } diff --git a/lldb/source/Plugins/ObjectFile/PDB/ObjectFilePDB.cpp b/lldb/source/Plugins/ObjectFile/PDB/ObjectFilePDB.cpp index f0832dbf07347..75cc54e4f0d48 100644 --- a/lldb/source/Plugins/ObjectFile/PDB/ObjectFilePDB.cpp +++ b/lldb/source/Plugins/ObjectFile/PDB/ObjectFilePDB.cpp @@ -116,18 +116,31 @@ size_t ObjectFilePDB::GetModuleSpecifications( ModuleSpec module_spec(file); llvm::BumpPtrAllocator allocator; std::unique_ptr pdb_file = loadPDBFile(file.GetPath(), allocator); - if (!pdb_file) + if (!pdb_file){ +#if !defined(__AIX__) return initial_count; +#else + return specs.GetSize() - initial_count; +#endif + } auto info_stream = pdb_file->getPDBInfoStream(); if (!info_stream) { llvm::consumeError(info_stream.takeError()); +#if !defined(__AIX__) return initial_count; +#else + return specs.GetSize() - initial_count; +#endif } auto dbi_stream = pdb_file->getPDBDbiStream(); if (!dbi_stream) { llvm::consumeError(dbi_stream.takeError()); +#if !defined(__AIX__) return initial_count; +#else + return specs.GetSize() - initial_count; +#endif } lldb_private::UUID &uuid = module_spec.GetUUID(); diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp 
b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index bda691ade8af0..db8fa78043fdc 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -252,8 +252,13 @@ size_t ObjectFilePECOFF::GetModuleSpecifications( lldb::offset_t data_offset, lldb::offset_t file_offset, lldb::offset_t length, lldb_private::ModuleSpecList &specs) { const size_t initial_count = specs.GetSize(); - if (!data_sp || !ObjectFilePECOFF::MagicBytesMatch(data_sp)) + if (!data_sp || !ObjectFilePECOFF::MagicBytesMatch(data_sp)){ +#if !defined(__AIX__) return initial_count; +#else + return specs.GetSize() - initial_count; +#endif + } Log *log = GetLog(LLDBLog::Object); @@ -266,12 +271,21 @@ size_t ObjectFilePECOFF::GetModuleSpecifications( if (!binary) { LLDB_LOG_ERROR(log, binary.takeError(), "Failed to create binary for file ({1}): {0}", file); +#if !defined(__AIX__) return initial_count; +#else + return specs.GetSize() - initial_count; +#endif } auto *COFFObj = llvm::dyn_cast(binary->get()); - if (!COFFObj) + if (!COFFObj){ +#if !defined(__AIX__) return initial_count; +#else + return specs.GetSize() - initial_count; +#endif + } ModuleSpec module_spec(file); ArchSpec &spec = module_spec.GetArchitecture(); diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/CMakeLists.txt b/lldb/source/Plugins/ObjectFile/XCOFF/CMakeLists.txt new file mode 100644 index 0000000000000..8840248574c88 --- /dev/null +++ b/lldb/source/Plugins/ObjectFile/XCOFF/CMakeLists.txt @@ -0,0 +1,13 @@ +add_lldb_library(lldbPluginObjectFileXCOFF PLUGIN + ObjectFileXCOFF.cpp + + LINK_LIBS + lldbCore + lldbHost + lldbSymbol + lldbTarget + LINK_COMPONENTS + BinaryFormat + Object + Support + ) diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp new file mode 100644 index 0000000000000..a4d9ea295b4c3 --- /dev/null +++ 
b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -0,0 +1,780 @@ +//===-- ObjectFileXCOFF.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ObjectFileXCOFF.h" + +#include +#include +#include +#include + +#include "lldb/Utility/FileSpecList.h" +#include "lldb/Core/Module.h" +#include "lldb/Core/ModuleSpec.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Core/Progress.h" +#include "lldb/Core/Section.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Host/LZMA.h" +#include "lldb/Symbol/DWARFCallFrameInfo.h" +#include "lldb/Symbol/SymbolContext.h" +#include "lldb/Target/SectionLoadList.h" +#include "lldb/Target/Process.h" +#include "lldb/Target/Target.h" +#include "lldb/Utility/ArchSpec.h" +#include "lldb/Utility/DataBufferHeap.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/RangeMap.h" +#include "lldb/Utility/Status.h" +#include "lldb/Utility/Stream.h" +#include "lldb/Utility/Timer.h" +#include "llvm/ADT/IntervalMap.h" +#include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/XCOFF.h" +#include "llvm/Object/Decompressor.h" +#include "llvm/Support/CRC.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Object/XCOFFObjectFile.h" + +using namespace llvm; +using namespace lldb; +using namespace lldb_private; + +LLDB_PLUGIN_DEFINE(ObjectFileXCOFF) + +char ObjectFileXCOFF::ID; + +// FIXME: target 64bit at this moment. + +// Static methods. 
+void ObjectFileXCOFF::Initialize() { + PluginManager::RegisterPlugin(GetPluginNameStatic(), + GetPluginDescriptionStatic(), CreateInstance, + CreateMemoryInstance, GetModuleSpecifications); +} + +void ObjectFileXCOFF::Terminate() { + PluginManager::UnregisterPlugin(CreateInstance); +} + +bool UGLY_FLAG_FOR_AIX __attribute__((weak)) = false; + +ObjectFile *ObjectFileXCOFF::CreateInstance(const lldb::ModuleSP &module_sp, + DataBufferSP data_sp, + lldb::offset_t data_offset, + const lldb_private::FileSpec *file, + lldb::offset_t file_offset, + lldb::offset_t length) { + if (!data_sp) { + data_sp = MapFileData(*file, length, file_offset); + if (!data_sp) + return nullptr; + data_offset = 0; + } + + if (!ObjectFileXCOFF::MagicBytesMatch(data_sp, data_offset, length)) + return nullptr; + + // Update the data to contain the entire file if it doesn't already + if (data_sp->GetByteSize() < length) { + data_sp = MapFileData(*file, length, file_offset); + if (!data_sp) + return nullptr; + data_offset = 0; + } + auto objfile_up = std::make_unique( + module_sp, data_sp, data_offset, file, file_offset, length); + if (!objfile_up) + return nullptr; + + // Cache xcoff binary. + if (!objfile_up->CreateBinary()) + return nullptr; + + if (!objfile_up->ParseHeader()) + //FIXME objfile leak + return nullptr; + + UGLY_FLAG_FOR_AIX = true; + return objfile_up.release(); +} + +bool ObjectFileXCOFF::CreateBinary() { + if (m_binary) + return true; + + Log *log = GetLog(LLDBLog::Object); + + auto binary = llvm::object::XCOFFObjectFile::createObjectFile(llvm::MemoryBufferRef( + toStringRef(m_data.GetData()), m_file.GetFilename().GetStringRef()), + file_magic::xcoff_object_64); + if (!binary) { + LLDB_LOG_ERROR(log, binary.takeError(), + "Failed to create binary for file ({1}): {0}", m_file); + return false; + } + + // Make sure we only handle COFF format. 
+ m_binary = + llvm::unique_dyn_cast(std::move(*binary)); + if (!m_binary) + return false; + + LLDB_LOG(log, "this = {0}, module = {1} ({2}), file = {3}, binary = {4}", + this, GetModule().get(), GetModule()->GetSpecificationDescription(), + m_file.GetPath(), m_binary.get()); + return true; +} + +ObjectFile *ObjectFileXCOFF::CreateMemoryInstance( + const lldb::ModuleSP &module_sp, WritableDataBufferSP data_sp, + const lldb::ProcessSP &process_sp, lldb::addr_t header_addr) { + return nullptr; +} + +size_t ObjectFileXCOFF::GetModuleSpecifications( + const lldb_private::FileSpec &file, lldb::DataBufferSP &data_sp, + lldb::offset_t data_offset, lldb::offset_t file_offset, + lldb::offset_t length, lldb_private::ModuleSpecList &specs) { + const size_t initial_count = specs.GetSize(); + + if (ObjectFileXCOFF::MagicBytesMatch(data_sp, 0, data_sp->GetByteSize())) { + ArchSpec arch_spec = ArchSpec(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE); + ModuleSpec spec(file, arch_spec); + spec.GetArchitecture().SetArchitecture(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE, llvm::Triple::AIX); + specs.Append(spec); + } + return specs.GetSize() - initial_count; +} + +static uint32_t XCOFFHeaderSizeFromMagic(uint32_t magic) { + switch (magic) { + /* TODO: 32bit not supported yet + case XCOFF::XCOFF32: + return sizeof(struct llvm::object::XCOFFFileHeader32); + */ + + case XCOFF::XCOFF64: + return sizeof(struct llvm::object::XCOFFFileHeader64); + break; + + default: + break; + } + return 0; +} + +bool ObjectFileXCOFF::MagicBytesMatch(DataBufferSP &data_sp, + lldb::addr_t data_offset, + lldb::addr_t data_length) { + lldb_private::DataExtractor data; + data.SetData(data_sp, data_offset, data_length); + lldb::offset_t offset = 0; + uint16_t magic = data.GetU16(&offset); + return XCOFFHeaderSizeFromMagic(magic) != 0; +} + +bool ObjectFileXCOFF::ParseHeader() { + ModuleSP module_sp(GetModule()); + if (module_sp) { + std::lock_guard guard(module_sp->GetMutex()); + 
m_sect_headers.clear(); + lldb::offset_t offset = 0; + + if (ParseXCOFFHeader(m_data, &offset, m_xcoff_header)) { + m_data.SetAddressByteSize(GetAddressByteSize()); + if (m_xcoff_header.auxhdrsize > 0) + ParseXCOFFOptionalHeader(m_data, &offset); + ParseSectionHeaders(offset); + } + return true; + } + + return false; +} + +bool ObjectFileXCOFF::ParseXCOFFHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr, + xcoff_header_t &xcoff_header) { + //FIXME: data.ValidOffsetForDataOfSize + xcoff_header.magic = data.GetU16(offset_ptr); + xcoff_header.nsects = data.GetU16(offset_ptr); + xcoff_header.modtime = data.GetU32(offset_ptr); + xcoff_header.symoff = data.GetU64(offset_ptr); + xcoff_header.auxhdrsize = data.GetU16(offset_ptr); + xcoff_header.flags = data.GetU16(offset_ptr); + xcoff_header.nsyms = data.GetU32(offset_ptr); + return true; +} + +bool ObjectFileXCOFF::ParseXCOFFOptionalHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr) { + lldb::offset_t init_offset = *offset_ptr; + //FIXME: data.ValidOffsetForDataOfSize + m_xcoff_aux_header.AuxMagic = data.GetU16(offset_ptr); + m_xcoff_aux_header.Version = data.GetU16(offset_ptr); + m_xcoff_aux_header.ReservedForDebugger = data.GetU32(offset_ptr); + m_xcoff_aux_header.TextStartAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.DataStartAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.TOCAnchorAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.SecNumOfEntryPoint = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfText = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfData = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfTOC = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfLoader = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfBSS = data.GetU16(offset_ptr); + m_xcoff_aux_header.MaxAlignOfText = data.GetU16(offset_ptr); + m_xcoff_aux_header.MaxAlignOfData = data.GetU16(offset_ptr); + m_xcoff_aux_header.ModuleType = data.GetU16(offset_ptr); + 
m_xcoff_aux_header.CpuFlag = data.GetU8(offset_ptr); + m_xcoff_aux_header.CpuType = data.GetU8(offset_ptr); + m_xcoff_aux_header.TextPageSize = data.GetU8(offset_ptr); + m_xcoff_aux_header.DataPageSize = data.GetU8(offset_ptr); + m_xcoff_aux_header.StackPageSize = data.GetU8(offset_ptr); + m_xcoff_aux_header.FlagAndTDataAlignment = data.GetU8(offset_ptr); + m_xcoff_aux_header.TextSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.InitDataSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.BssDataSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.EntryPointAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.MaxStackSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.MaxDataSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.SecNumOfTData = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfTBSS = data.GetU16(offset_ptr); + m_xcoff_aux_header.XCOFF64Flag = data.GetU16(offset_ptr); + lldb::offset_t last_offset = *offset_ptr; + if ((last_offset - init_offset) < m_xcoff_header.auxhdrsize) + *offset_ptr += (m_xcoff_header.auxhdrsize - (last_offset - init_offset)); + return true; +} + +bool ObjectFileXCOFF::ParseSectionHeaders( + uint32_t section_header_data_offset) { + const uint32_t nsects = m_xcoff_header.nsects; + m_sect_headers.clear(); + + if (nsects > 0) { + const size_t section_header_byte_size = nsects * m_binary->getSectionHeaderSize(); + lldb_private::DataExtractor section_header_data = + ReadImageData(section_header_data_offset, section_header_byte_size); + + lldb::offset_t offset = 0; + //FIXME: section_header_data.ValidOffsetForDataOfSize + m_sect_headers.resize(nsects); + + for (uint32_t idx = 0; idx < nsects; ++idx) { + const void *name_data = section_header_data.GetData(&offset, 8); + if (name_data) { + memcpy(m_sect_headers[idx].name, name_data, 8); + m_sect_headers[idx].phyaddr = section_header_data.GetU64(&offset); + m_sect_headers[idx].vmaddr = section_header_data.GetU64(&offset); + m_sect_headers[idx].size = 
section_header_data.GetU64(&offset); + m_sect_headers[idx].offset = section_header_data.GetU64(&offset); + m_sect_headers[idx].reloff = section_header_data.GetU64(&offset); + m_sect_headers[idx].lineoff = section_header_data.GetU64(&offset); + m_sect_headers[idx].nreloc = section_header_data.GetU32(&offset); + m_sect_headers[idx].nline = section_header_data.GetU32(&offset); + m_sect_headers[idx].flags = section_header_data.GetU32(&offset); + offset += 4; + } else { + offset += (m_binary->getSectionHeaderSize() - 8); + } + } + } + + return !m_sect_headers.empty(); +} + +lldb_private::DataExtractor ObjectFileXCOFF::ReadImageData(uint32_t offset, size_t size) { + if (!size) + return {}; + + if (m_data.ValidOffsetForDataOfSize(offset, size)) + return lldb_private::DataExtractor(m_data, offset, size); + + assert(0); + ProcessSP process_sp(m_process_wp.lock()); + lldb_private::DataExtractor data; + if (process_sp) { + auto data_up = std::make_unique(size, 0); + Status readmem_error; + size_t bytes_read = + process_sp->ReadMemory(offset, data_up->GetBytes(), + data_up->GetByteSize(), readmem_error); + if (bytes_read == size) { + DataBufferSP buffer_sp(data_up.release()); + data.SetData(buffer_sp, 0, buffer_sp->GetByteSize()); + } + } + return data; +} + +bool ObjectFileXCOFF::SetLoadAddress(Target &target, lldb::addr_t value, + bool value_is_offset) { + bool changed = false; + ModuleSP module_sp = GetModule(); + if (module_sp) { + size_t num_loaded_sections = 0; + SectionList *section_list = GetSectionList(); + if (section_list) { + const size_t num_sections = section_list->GetSize(); + size_t sect_idx = 0; + + for (sect_idx = 0; sect_idx < num_sections; ++sect_idx) { + // Iterate through the object file sections to find all of the sections + // that have SHF_ALLOC in their flag bits. 
+ SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); + if (section_sp && !section_sp->IsThreadSpecific()) { + bool use_offset = false; + if (strcmp(section_sp->GetName().AsCString(), ".text") == 0 || + strcmp(section_sp->GetName().AsCString(), ".data") == 0 || + strcmp(section_sp->GetName().AsCString(), ".bss") == 0) + use_offset = true; + + if (target.GetSectionLoadList().SetSectionLoadAddress( + section_sp, (use_offset ? + (section_sp->GetFileOffset() + value) : (section_sp->GetFileAddress() + value)))) + ++num_loaded_sections; + } + } + changed = num_loaded_sections > 0; + } + } + return changed; +} + +bool ObjectFileXCOFF::SetLoadAddressByType(Target &target, lldb::addr_t value, + bool value_is_offset, int type_id) { + bool changed = false; + ModuleSP module_sp = GetModule(); + if (module_sp) { + size_t num_loaded_sections = 0; + SectionList *section_list = GetSectionList(); + if (section_list) { + const size_t num_sections = section_list->GetSize(); + size_t sect_idx = 0; + + for (sect_idx = 0; sect_idx < num_sections; ++sect_idx) { + // Iterate through the object file sections to find all of the sections + // that have SHF_ALLOC in their flag bits. 
+ SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); + if (type_id == 1 && section_sp && strcmp(section_sp->GetName().AsCString(), ".text") == 0) { + if (!section_sp->IsThreadSpecific()) { + if (target.GetSectionLoadList().SetSectionLoadAddress( + section_sp, section_sp->GetFileOffset() + value)) + ++num_loaded_sections; + } + } else if (type_id == 2 && section_sp && strcmp(section_sp->GetName().AsCString(), ".data") == 0) { + if (!section_sp->IsThreadSpecific()) { + if (target.GetSectionLoadList().SetSectionLoadAddress( + section_sp, section_sp->GetFileAddress() + value)) + ++num_loaded_sections; + } + } + } + changed = num_loaded_sections > 0; + } + } + return changed; +} + +ByteOrder ObjectFileXCOFF::GetByteOrder() const { + return eByteOrderBig; +} + +bool ObjectFileXCOFF::IsExecutable() const { + return true; +} + +uint32_t ObjectFileXCOFF::GetAddressByteSize() const { + if (m_xcoff_header.magic == XCOFF::XCOFF64) + return 8; + else if (m_xcoff_header.magic == XCOFF::XCOFF32) + return 4; + return 4; +} + +AddressClass ObjectFileXCOFF::GetAddressClass(addr_t file_addr) { + return AddressClass::eUnknown; +} + +lldb::SymbolType ObjectFileXCOFF::MapSymbolType(llvm::object::SymbolRef::Type sym_type) { + if (sym_type == llvm::object::SymbolRef::ST_Function) + return lldb::eSymbolTypeCode; + else if (sym_type == llvm::object::SymbolRef::ST_Data) + return lldb::eSymbolTypeData; + return lldb::eSymbolTypeInvalid; +} + +void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) { + SectionList *sect_list = GetSectionList(); + const uint32_t num_syms = m_xcoff_header.nsyms; + uint32_t sidx = 0; + if (num_syms > 0 && m_xcoff_header.symoff > 0) { + const uint32_t symbol_size = XCOFF::SymbolTableEntrySize; + const size_t symbol_data_size = num_syms * symbol_size; + lldb_private::DataExtractor symtab_data = + ReadImageData(m_xcoff_header.symoff, symbol_data_size); + + lldb::offset_t offset = 0; + std::string symbol_name; + Symbol *symbols = 
lldb_symtab.Resize(num_syms); + llvm::object::symbol_iterator SI = m_binary->symbol_begin(); + for (uint32_t i = 0; i < num_syms; ++i, ++SI) { + xcoff_symbol_t symbol; + const uint32_t symbol_offset = offset; + symbol.value = symtab_data.GetU64(&offset); + symbol.offset = symtab_data.GetU32(&offset); + Expected symbol_name_or_err = m_binary->getStringTableEntry(symbol.offset); + if (!symbol_name_or_err) { + consumeError(symbol_name_or_err.takeError()); + return; + } + StringRef symbol_name_str = symbol_name_or_err.get(); + symbol_name.assign(symbol_name_str.data()); + symbol.sect = symtab_data.GetU16(&offset); + symbol.type = symtab_data.GetU16(&offset); + symbol.storage = symtab_data.GetU8(&offset); + symbol.naux = symtab_data.GetU8(&offset); + // Allow C_HIDEXT TOC symbol, and check others. + if (symbol.storage == XCOFF::C_HIDEXT && strcmp(symbol_name.c_str(), "TOC") != 0) { + if (symbol.naux == 0) + continue; + if (symbol.naux > 1) { + i += symbol.naux; + offset += symbol.naux * symbol_size; + continue; + } + /* Allow XCOFF::C_HIDEXT with following SMC and AT: + StorageMappingClass: XMC_PR (0x0) + Auxiliary Type: AUX_CSECT (0xFB) + */ + xcoff_sym_csect_aux_entry_t symbol_aux; + symbol_aux.section_or_len_low_byte = symtab_data.GetU32(&offset); + symbol_aux.parameter_hash_index = symtab_data.GetU32(&offset); + symbol_aux.type_check_sect_num = symtab_data.GetU16(&offset); + symbol_aux.symbol_alignment_and_type = symtab_data.GetU8(&offset); + symbol_aux.storage_mapping_class = symtab_data.GetU8(&offset); + symbol_aux.section_or_len_high_byte = symtab_data.GetU32(&offset); + symbol_aux.pad = symtab_data.GetU8(&offset); + symbol_aux.aux_type = symtab_data.GetU8(&offset); + offset -= symbol.naux * symbol_size; + if (symbol_aux.storage_mapping_class != XCOFF::XMC_PR || symbol_aux.aux_type != XCOFF::AUX_CSECT) { + i += symbol.naux; + offset += symbol.naux * symbol_size; + continue; + } + } + // Remove the dot prefix for demangle + if (symbol_name_str.size() > 1 && 
symbol_name_str.data()[0] == '.') { + symbols[sidx].GetMangled().SetValue(ConstString(symbol_name.c_str() + 1)); + } else { + symbols[sidx].GetMangled().SetValue(ConstString(symbol_name.c_str())); + } + if ((int16_t)symbol.sect >= 1) { + Address symbol_addr(sect_list->GetSectionAtIndex((size_t)(symbol.sect - 1)), + (symbol.value - sect_list->GetSectionAtIndex((size_t)(symbol.sect - 1))->GetFileAddress())); + symbols[sidx].GetAddressRef() = symbol_addr; + + Expected sym_type_or_err = SI->getType(); + if (!sym_type_or_err) { + consumeError(sym_type_or_err.takeError()); + return; + } + symbols[sidx].SetType(MapSymbolType(sym_type_or_err.get())); + } + ++sidx; + + if (symbol.naux > 0) { + i += symbol.naux; + offset += symbol.naux * symbol_size; + } + } + lldb_symtab.Resize(sidx); + } +} + +bool ObjectFileXCOFF::IsStripped() { + return false; +} + +void ObjectFileXCOFF::CreateSections(SectionList &unified_section_list) { + if (m_sections_up) + return; + m_sections_up = std::make_unique(); + ModuleSP module_sp(GetModule()); + if (module_sp) { + std::lock_guard guard(module_sp->GetMutex()); + + const uint32_t nsects = m_sect_headers.size(); + ModuleSP module_sp(GetModule()); + for (uint32_t idx = 0; idx < nsects; ++idx) { + llvm::StringRef sect_name = GetSectionName(m_sect_headers[idx]); + ConstString const_sect_name(sect_name); + SectionType section_type = GetSectionType(sect_name, m_sect_headers[idx]); + + SectionSP section_sp(new Section( + module_sp, // Module to which this section belongs + this, // Object file to which this section belongs + idx + 1, // Section ID is the 1 based section index. 
+ const_sect_name, // Name of this section + section_type, + m_sect_headers[idx].vmaddr, // File VM address == addresses as + // they are found in the object file + m_sect_headers[idx].size, // VM size in bytes of this section + m_sect_headers[idx].offset, // Offset to the data for this section in the file + m_sect_headers[idx].size, // Size in bytes of this section as found in the file + 0, // FIXME: alignment + m_sect_headers[idx].flags)); // Flags for this section + + // FIXME + uint32_t permissions = 0; + permissions |= ePermissionsReadable; + if (m_sect_headers[idx].flags & (XCOFF::STYP_DATA | XCOFF::STYP_BSS)) + permissions |= ePermissionsWritable; + if (m_sect_headers[idx].flags & XCOFF::STYP_TEXT) + permissions |= ePermissionsExecutable; + section_sp->SetPermissions(permissions); + + m_sections_up->AddSection(section_sp); + unified_section_list.AddSection(section_sp); + } + } +} + +llvm::StringRef ObjectFileXCOFF::GetSectionName(const section_header_t §) { + llvm::StringRef hdr_name(sect.name, std::size(sect.name)); + hdr_name = hdr_name.split('\0').first; + if (hdr_name.consume_front("/")) { + lldb::offset_t stroff; + if (!to_integer(hdr_name, stroff, 10)) + return ""; + lldb::offset_t string_file_offset = + m_xcoff_header.symoff + (m_xcoff_header.nsyms * static_cast(XCOFF::SymbolTableEntrySize)) + stroff; + if (const char *name = m_data.GetCStr(&string_file_offset)) + return name; + return ""; + } + return hdr_name; +} + +SectionType ObjectFileXCOFF::GetSectionType(llvm::StringRef sect_name, + const section_header_t §) { + if (sect.flags & XCOFF::STYP_TEXT) + return eSectionTypeCode; + if (sect.flags & XCOFF::STYP_DATA) + return eSectionTypeData; + if (sect.flags & XCOFF::STYP_BSS) + return eSectionTypeZeroFill; + if (sect.flags & XCOFF::STYP_DWARF) { + SectionType section_type = + llvm::StringSwitch(sect_name) + .Case(".dwinfo", eSectionTypeDWARFDebugInfo) + .Case(".dwline", eSectionTypeDWARFDebugLine) + .Case(".dwabrev", eSectionTypeDWARFDebugAbbrev) + 
.Default(eSectionTypeInvalid); + + if (section_type != eSectionTypeInvalid) + return section_type; + } + return eSectionTypeOther; +} + +void ObjectFileXCOFF::Dump(Stream *s) { +} + +ArchSpec ObjectFileXCOFF::GetArchitecture() { + ArchSpec arch_spec = ArchSpec(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE); + return arch_spec; +} + +UUID ObjectFileXCOFF::GetUUID() { + return UUID(); +} + +std::optional ObjectFileXCOFF::GetDebugLink() { + return std::nullopt; +} + +uint32_t ObjectFileXCOFF::ParseDependentModules() { + ModuleSP module_sp(GetModule()); + if (!module_sp) + return 0; + + std::lock_guard guard(module_sp->GetMutex()); + if (m_deps_filespec) + return m_deps_filespec->GetSize(); + + // Cache coff binary if it is not done yet. + if (!CreateBinary()) + return 0; + + Log *log = GetLog(LLDBLog::Object); + LLDB_LOG(log, "this = {0}, module = {1} ({2}), file = {3}, binary = {4}", + this, GetModule().get(), GetModule()->GetSpecificationDescription(), + m_file.GetPath(), m_binary.get()); + + m_deps_filespec = FileSpecList(); + + auto ImportFilesOrError = m_binary->getImportFileTable(); + if (!ImportFilesOrError) { + consumeError(ImportFilesOrError.takeError()); + return 0; + } + +#if 0 + StringRef ImportFileTable = ImportFilesOrError.get(); + const char *CurrentStr = ImportFileTable.data(); + const char *TableEnd = ImportFileTable.end(); + const char *Basename = nullptr; + + for (size_t StrIndex = 0; CurrentStr < TableEnd; + ++StrIndex, CurrentStr += strlen(CurrentStr) + 1) { + if (StrIndex >= 3 && StrIndex % 3 == 1) { + // base_name + llvm::StringRef dll_name(CurrentStr); + Basename = CurrentStr; + + // At this moment we only have the base name of the DLL. The full path can + // only be seen after the dynamic loading. Our best guess is Try to get it + // with the help of the object file's directory. 
+ llvm::SmallString<128> dll_fullpath; + FileSpec dll_specs(dll_name); + // FIXME: hack to get libc.a loaded + if (strcmp(CurrentStr, "libc.a") == 0) { + dll_specs.GetDirectory().SetString("/usr/lib"); + } else { + dll_specs.GetDirectory().SetString(m_file.GetDirectory().GetCString()); + } + + if (!llvm::sys::fs::real_path(dll_specs.GetPath(), dll_fullpath)) + //m_deps_filespec->EmplaceBack(dll_fullpath); + m_deps_filespec->EmplaceBack("/usr/lib/libc.a(shr_64.o)"); + else { + // Known DLLs or DLL not found in the object file directory. + m_deps_filespec->EmplaceBack(dll_name); + } + } else if (StrIndex >= 3 && StrIndex % 3 == 2) { + // archive_member_name + if (strcmp(CurrentStr, "") == 0) { + continue; + } + assert(strcmp(Basename, "") != 0); + std::map>::iterator iter = m_deps_base_members.find(std::string(Basename)); + if (iter == m_deps_base_members.end()) { + m_deps_base_members[std::string(Basename)] = std::vector(); + iter = m_deps_base_members.find(std::string(Basename)); + } + iter->second.push_back(std::string(CurrentStr)); + } + } +#endif + return m_deps_filespec->GetSize(); +} + +uint32_t ObjectFileXCOFF::GetDependentModules(FileSpecList &files) { + auto num_modules = ParseDependentModules(); + auto original_size = files.GetSize(); + + for (unsigned i = 0; i < num_modules; ++i) + files.AppendIfUnique(m_deps_filespec->GetFileSpecAtIndex(i)); + + return files.GetSize() - original_size; +} + +Address ObjectFileXCOFF::GetImageInfoAddress(Target *target) { + return Address(); +} + +lldb_private::Address ObjectFileXCOFF::GetEntryPointAddress() { + if (m_entry_point_address.IsValid()) + return m_entry_point_address; + + if (!ParseHeader() || !IsExecutable()) + return m_entry_point_address; + + SectionList *section_list = GetSectionList(); + addr_t vm_addr = m_xcoff_aux_header.EntryPointAddr; + SectionSP section_sp( + section_list->FindSectionContainingFileAddress(vm_addr)); + if (section_sp) { + lldb::offset_t offset_ptr = section_sp->GetFileOffset() + 
(vm_addr - section_sp->GetFileAddress()); + vm_addr = m_data.GetU64(&offset_ptr); + } + + if (!section_list) + m_entry_point_address.SetOffset(vm_addr); + else + m_entry_point_address.ResolveAddressUsingFileSections(vm_addr, + section_list); + + return m_entry_point_address; +} + +lldb_private::Address ObjectFileXCOFF::GetBaseAddress() { + return lldb_private::Address(); +} + +ObjectFile::Type ObjectFileXCOFF::CalculateType() { + if (m_xcoff_header.flags & XCOFF::F_EXEC) + return eTypeExecutable; + else if (m_xcoff_header.flags & XCOFF::F_SHROBJ) + return eTypeSharedLibrary; + return eTypeUnknown; +} + +ObjectFile::Strata ObjectFileXCOFF::CalculateStrata() { + return eStrataUnknown; +} + +llvm::StringRef +ObjectFileXCOFF::StripLinkerSymbolAnnotations(llvm::StringRef symbol_name) const { + return llvm::StringRef(); +} + +void ObjectFileXCOFF::RelocateSection(lldb_private::Section *section) +{ +} + +std::vector +ObjectFileXCOFF::GetLoadableData(Target &target) { + std::vector loadables; + return loadables; +} + +lldb::WritableDataBufferSP +ObjectFileXCOFF::MapFileDataWritable(const FileSpec &file, uint64_t Size, + uint64_t Offset) { + return FileSystem::Instance().CreateWritableDataBuffer(file.GetPath(), Size, + Offset); +} + +ObjectFileXCOFF::ObjectFileXCOFF(const lldb::ModuleSP &module_sp, + DataBufferSP data_sp, lldb::offset_t data_offset, + const FileSpec *file, lldb::offset_t file_offset, + lldb::offset_t length) + : ObjectFile(module_sp, file, file_offset, length, data_sp, data_offset), + m_xcoff_header(), m_sect_headers(), m_deps_filespec(), m_deps_base_members(), + m_entry_point_address() { + ::memset(&m_xcoff_header, 0, sizeof(m_xcoff_header)); + if (file) + m_file = *file; +} + +ObjectFileXCOFF::ObjectFileXCOFF(const lldb::ModuleSP &module_sp, + DataBufferSP header_data_sp, + const lldb::ProcessSP &process_sp, + addr_t header_addr) + : ObjectFile(module_sp, process_sp, header_addr, header_data_sp), + m_xcoff_header(), m_sect_headers(), m_deps_filespec(), 
m_deps_base_members(), + m_entry_point_address() { + ::memset(&m_xcoff_header, 0, sizeof(m_xcoff_header)); +} diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h new file mode 100644 index 0000000000000..5a12d16886489 --- /dev/null +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h @@ -0,0 +1,243 @@ +//===-- ObjectFileXCOFF.h --------------------------------------- -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_OBJECTFILE_XCOFF_OBJECTFILEXCOFF_H +#define LLDB_SOURCE_PLUGINS_OBJECTFILE_XCOFF_OBJECTFILEXCOFF_H + +#include + +#include + +#include "lldb/Symbol/ObjectFile.h" +#include "lldb/Utility/ArchSpec.h" +#include "lldb/Utility/FileSpec.h" +#include "lldb/Utility/UUID.h" +#include "lldb/lldb-private.h" +#include "llvm/Object/XCOFFObjectFile.h" + +/// \class ObjectFileXCOFF +/// Generic XCOFF object file reader. +/// +/// This class provides a generic XCOFF (32/64 bit) reader plugin implementing +/// the ObjectFile protocol. 
+class ObjectFileXCOFF : public lldb_private::ObjectFile { +public: + // Static Functions + static void Initialize(); + + static void Terminate(); + + static llvm::StringRef GetPluginNameStatic() { return "xcoff"; } + + static llvm::StringRef GetPluginDescriptionStatic() { + return "XCOFF object file reader."; + } + + static lldb_private::ObjectFile * + CreateInstance(const lldb::ModuleSP &module_sp, lldb::DataBufferSP data_sp, + lldb::offset_t data_offset, const lldb_private::FileSpec *file, + lldb::offset_t file_offset, lldb::offset_t length); + + static lldb_private::ObjectFile *CreateMemoryInstance( + const lldb::ModuleSP &module_sp, lldb::WritableDataBufferSP data_sp, + const lldb::ProcessSP &process_sp, lldb::addr_t header_addr); + + static size_t GetModuleSpecifications(const lldb_private::FileSpec &file, + lldb::DataBufferSP &data_sp, + lldb::offset_t data_offset, + lldb::offset_t file_offset, + lldb::offset_t length, + lldb_private::ModuleSpecList &specs); + + static bool MagicBytesMatch(lldb::DataBufferSP &data_sp, lldb::addr_t offset, + lldb::addr_t length); + + static lldb::SymbolType MapSymbolType(llvm::object::SymbolRef::Type sym_type); + + // PluginInterface protocol + llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } + + // LLVM RTTI support + static char ID; + bool isA(const void *ClassID) const override { + return ClassID == &ID || ObjectFile::isA(ClassID); + } + static bool classof(const ObjectFile *obj) { return obj->isA(&ID); } + + // ObjectFile Protocol. 
+ bool ParseHeader() override; + + bool SetLoadAddress(lldb_private::Target &target, lldb::addr_t value, + bool value_is_offset) override; + + bool SetLoadAddressByType(lldb_private::Target &target, lldb::addr_t value, + bool value_is_offset, int type_id) override; + + lldb::ByteOrder GetByteOrder() const override; + + bool IsExecutable() const override; + + uint32_t GetAddressByteSize() const override; + + lldb_private::AddressClass GetAddressClass(lldb::addr_t file_addr) override; + + void ParseSymtab(lldb_private::Symtab &symtab) override; + + bool IsStripped() override; + + void CreateSections(lldb_private::SectionList &unified_section_list) override; + + void Dump(lldb_private::Stream *s) override; + + lldb_private::ArchSpec GetArchitecture() override; + + lldb_private::UUID GetUUID() override; + + /// Return the contents of the .gnu_debuglink section, if the object file + /// contains it. + std::optional GetDebugLink(); + + uint32_t GetDependentModules(lldb_private::FileSpecList &files) override; + + lldb_private::Address + GetImageInfoAddress(lldb_private::Target *target) override; + + lldb_private::Address GetEntryPointAddress() override; + + lldb_private::Address GetBaseAddress() override; + + ObjectFile::Type CalculateType() override; + + ObjectFile::Strata CalculateStrata() override; + + llvm::StringRef + StripLinkerSymbolAnnotations(llvm::StringRef symbol_name) const override; + + void RelocateSection(lldb_private::Section *section) override; + + lldb_private::DataExtractor ReadImageData(uint32_t offset, size_t size); + + ObjectFileXCOFF(const lldb::ModuleSP &module_sp, lldb::DataBufferSP data_sp, + lldb::offset_t data_offset, const lldb_private::FileSpec *file, + lldb::offset_t offset, lldb::offset_t length); + + ObjectFileXCOFF(const lldb::ModuleSP &module_sp, + lldb::DataBufferSP header_data_sp, + const lldb::ProcessSP &process_sp, lldb::addr_t header_addr); + +protected: + + typedef struct xcoff_header { + uint16_t magic; + uint16_t nsects; + 
uint32_t modtime; + uint64_t symoff; + uint32_t nsyms; + uint16_t auxhdrsize; + uint16_t flags; + } xcoff_header_t; + + typedef struct xcoff_aux_header { + uint16_t AuxMagic; + uint16_t Version; + uint32_t ReservedForDebugger; + uint64_t TextStartAddr; + uint64_t DataStartAddr; + uint64_t TOCAnchorAddr; + uint16_t SecNumOfEntryPoint; + uint16_t SecNumOfText; + uint16_t SecNumOfData; + uint16_t SecNumOfTOC; + uint16_t SecNumOfLoader; + uint16_t SecNumOfBSS; + uint16_t MaxAlignOfText; + uint16_t MaxAlignOfData; + uint16_t ModuleType; + uint8_t CpuFlag; + uint8_t CpuType; + uint8_t TextPageSize; + uint8_t DataPageSize; + uint8_t StackPageSize; + uint8_t FlagAndTDataAlignment; + uint64_t TextSize; + uint64_t InitDataSize; + uint64_t BssDataSize; + uint64_t EntryPointAddr; + uint64_t MaxStackSize; + uint64_t MaxDataSize; + uint16_t SecNumOfTData; + uint16_t SecNumOfTBSS; + uint16_t XCOFF64Flag; + } xcoff_aux_header_t; + + typedef struct section_header { + char name[8]; + uint64_t phyaddr; // Physical Addr + uint64_t vmaddr; // Virtual Addr + uint64_t size; // Section size + uint64_t offset; // File offset to raw data + uint64_t reloff; // Offset to relocations + uint64_t lineoff; // Offset to line table entries + uint32_t nreloc; // Number of relocation entries + uint32_t nline; // Number of line table entries + uint32_t flags; + } section_header_t; + + typedef struct xcoff_symbol { + uint64_t value; + uint32_t offset; + uint16_t sect; + uint16_t type; + uint8_t storage; + uint8_t naux; + } xcoff_symbol_t; + + typedef struct xcoff_sym_csect_aux_entry { + uint32_t section_or_len_low_byte; + uint32_t parameter_hash_index; + uint16_t type_check_sect_num; + uint8_t symbol_alignment_and_type; + uint8_t storage_mapping_class; + uint32_t section_or_len_high_byte; + uint8_t pad; + uint8_t aux_type; + } xcoff_sym_csect_aux_entry_t; + + static bool ParseXCOFFHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr, + xcoff_header_t &xcoff_header); + bool 
ParseXCOFFOptionalHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr); + bool ParseSectionHeaders(uint32_t offset); + + std::vector + GetLoadableData(lldb_private::Target &target) override; + + static lldb::WritableDataBufferSP + MapFileDataWritable(const lldb_private::FileSpec &file, uint64_t Size, + uint64_t Offset); + llvm::StringRef GetSectionName(const section_header_t §); + static lldb::SectionType GetSectionType(llvm::StringRef sect_name, + const section_header_t §); + + uint32_t ParseDependentModules(); + typedef std::vector SectionHeaderColl; + +private: + bool CreateBinary(); + + xcoff_header_t m_xcoff_header; + xcoff_aux_header_t m_xcoff_aux_header; + SectionHeaderColl m_sect_headers; + std::unique_ptr m_binary; + lldb_private::Address m_entry_point_address; + std::optional m_deps_filespec; + std::map> m_deps_base_members; +}; + +#endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_ELF_OBJECTFILEELF_H diff --git a/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp b/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp index e026ffefd645e..106e38b6e25ae 100644 --- a/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp +++ b/lldb/source/Plugins/OperatingSystem/Python/OperatingSystemPython.cpp @@ -227,7 +227,7 @@ ThreadSP OperatingSystemPython::CreateThreadFromThreadInfo( ThreadList &old_thread_list, std::vector &core_used_map, bool *did_create_ptr) { ThreadSP thread_sp; - tid_t tid = LLDB_INVALID_THREAD_ID; + lldb::tid_t tid = LLDB_INVALID_THREAD_ID; if (!thread_dict.GetValueForKeyAsInteger("tid", tid)) return ThreadSP(); diff --git a/lldb/source/Plugins/Platform/AIX/CMakeLists.txt b/lldb/source/Plugins/Platform/AIX/CMakeLists.txt new file mode 100644 index 0000000000000..85ff0a315eabd --- /dev/null +++ b/lldb/source/Plugins/Platform/AIX/CMakeLists.txt @@ -0,0 +1,13 @@ +add_definitions("-D_ALL_SOURCE") + +add_lldb_library(lldbPluginPlatformAIX PLUGIN + PlatformAIX.cpp + + LINK_LIBS + lldbBreakpoint 
+ lldbCore + lldbHost + lldbInterpreter + lldbTarget + lldbPluginPlatformPOSIX + ) diff --git a/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp b/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp new file mode 100644 index 0000000000000..b6b08b73bec41 --- /dev/null +++ b/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp @@ -0,0 +1,471 @@ +//===-- PlatformAIX.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "PlatformAIX.h" +#include "lldb/Host/Config.h" + +#include +#if LLDB_ENABLE_POSIX +#include +#endif + +#include "Utility/ARM64_DWARF_Registers.h" +#include "lldb/Core/Debugger.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Host/HostInfo.h" +#include "lldb/Symbol/UnwindPlan.h" +#include "lldb/Target/Process.h" +#include "lldb/Target/Target.h" +#include "lldb/Utility/FileSpec.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/State.h" +#include "lldb/Utility/Status.h" +#include "lldb/Utility/StreamString.h" + +// Define these constants from AIX mman.h for use when targeting remote aix +// systems even when host has different values. + +#if defined(__AIX__) +#include +#endif + +using namespace lldb; +using namespace lldb_private; +using namespace lldb_private::platform_aix; + +LLDB_PLUGIN_DEFINE(PlatformAIX) + +static uint32_t g_initialize_count = 0; + + +PlatformSP PlatformAIX::CreateInstance(bool force, const ArchSpec *arch) { + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOG(log, "force = {0}, arch=({1}, {2})", force, + arch ? arch->GetArchitectureName() : "", + arch ? 
arch->GetTriple().getTriple() : ""); + + bool create = force; + if (!create && arch && arch->IsValid()) { + const llvm::Triple &triple = arch->GetTriple(); + switch (triple.getOS()) { + case llvm::Triple::AIX: + create = true; + break; + + default: + break; + } + } + + LLDB_LOG(log, "create = {0}", create); + if (create) { + return PlatformSP(new PlatformAIX(false)); + } + return PlatformSP(); +} + +llvm::StringRef PlatformAIX::GetPluginDescriptionStatic(bool is_host) { + if (is_host) + return "Local AIX user platform plug-in."; + return "Remote AIX user platform plug-in."; +} + +void PlatformAIX::Initialize() { + PlatformPOSIX::Initialize(); + + if (g_initialize_count++ == 0) { +#if defined(__AIX__) + PlatformSP default_platform_sp(new PlatformAIX(true)); + default_platform_sp->SetSystemArchitecture(HostInfo::GetArchitecture()); + Platform::SetHostPlatform(default_platform_sp); +#endif + PluginManager::RegisterPlugin( + PlatformAIX::GetPluginNameStatic(false), + PlatformAIX::GetPluginDescriptionStatic(false), + PlatformAIX::CreateInstance, nullptr); + } +} + +void PlatformAIX::Terminate() { + if (g_initialize_count > 0) { + if (--g_initialize_count == 0) { + PluginManager::UnregisterPlugin(PlatformAIX::CreateInstance); + } + } + + PlatformPOSIX::Terminate(); +} + +/// Default Constructor +PlatformAIX::PlatformAIX(bool is_host) + : PlatformPOSIX(is_host) // This is the local host platform +{ + if (is_host) { + ArchSpec hostArch = HostInfo::GetArchitecture(HostInfo::eArchKindDefault); + m_supported_architectures.push_back(hostArch); + if (hostArch.GetTriple().isArch64Bit()) { + m_supported_architectures.push_back( + HostInfo::GetArchitecture(HostInfo::eArchKind32)); + } + } else { + m_supported_architectures = CreateArchList( + {llvm::Triple::x86_64, llvm::Triple::x86, llvm::Triple::arm, + llvm::Triple::aarch64, llvm::Triple::mips64, llvm::Triple::mips64, + llvm::Triple::hexagon, llvm::Triple::mips, llvm::Triple::mips64el, + llvm::Triple::mipsel, 
llvm::Triple::systemz}, + llvm::Triple::AIX); + } +} + +std::vector +PlatformAIX::GetSupportedArchitectures(const ArchSpec &process_host_arch) { + if (m_remote_platform_sp) + return m_remote_platform_sp->GetSupportedArchitectures(process_host_arch); + return m_supported_architectures; +} + +void PlatformAIX::GetStatus(Stream &strm) { + Platform::GetStatus(strm); + +#if LLDB_ENABLE_POSIX + // Display local kernel information only when we are running in host mode. + // Otherwise, we would end up printing non-AIX information (when running on + // Mac OS for example). + if (IsHost()) { + struct utsname un; + + if (uname(&un)) + return; + + strm.Printf(" Kernel: %s\n", un.sysname); + strm.Printf(" Release: %s\n", un.release); + strm.Printf(" Version: %s\n", un.version); + } +#endif +} + +uint32_t +PlatformAIX::GetResumeCountForLaunchInfo(ProcessLaunchInfo &launch_info) { + uint32_t resume_count = 0; + + // Always resume past the initial stop when we use eLaunchFlagDebug + if (launch_info.GetFlags().Test(eLaunchFlagDebug)) { + // Resume past the stop for the final exec into the true inferior. + ++resume_count; + } + + // If we're not launching a shell, we're done. + const FileSpec &shell = launch_info.GetShell(); + if (!shell) + return resume_count; + + std::string shell_string = shell.GetPath(); + // We're in a shell, so for sure we have to resume past the shell exec. + ++resume_count; + + // Figure out what shell we're planning on using. + const char *shell_name = strrchr(shell_string.c_str(), '/'); + if (shell_name == nullptr) + shell_name = shell_string.c_str(); + else + shell_name++; + + if (strcmp(shell_name, "csh") == 0 || strcmp(shell_name, "tcsh") == 0 || + strcmp(shell_name, "zsh") == 0 || strcmp(shell_name, "sh") == 0) { + // These shells seem to re-exec themselves. Add another resume. + ++resume_count; + } + + return resume_count; +} + +bool PlatformAIX::CanDebugProcess() { + if (IsHost()) { + return true; + } else { + // If we're connected, we can debug. 
+ return IsConnected(); + } +} + +void PlatformAIX::CalculateTrapHandlerSymbolNames() { + m_trap_handlers.push_back(ConstString("_sigtramp")); + m_trap_handlers.push_back(ConstString("__kernel_rt_sigreturn")); + m_trap_handlers.push_back(ConstString("__restore_rt")); +} + +static lldb::UnwindPlanSP GetAArch64TrapHanlderUnwindPlan(ConstString name) { + UnwindPlanSP unwind_plan_sp; + if (name != "__kernel_rt_sigreturn") + return unwind_plan_sp; + + UnwindPlan::RowSP row = std::make_shared(); + row->SetOffset(0); + + // In the signal trampoline frame, sp points to an rt_sigframe[1], which is: + // - 128-byte siginfo struct + // - ucontext struct: + // - 8-byte long (uc_flags) + // - 8-byte pointer (uc_link) + // - 24-byte stack_t + // - 128-byte signal set + // - 8 bytes of padding because sigcontext has 16-byte alignment + // - sigcontext/mcontext_t + // [1] + // https://github.com/torvalds/linux/blob/master/arch/arm64/kernel/signal.c + int32_t offset = 128 + 8 + 8 + 24 + 128 + 8; + // Then sigcontext[2] is: + // - 8 byte fault address + // - 31 8 byte registers + // - 8 byte sp + // - 8 byte pc + // [2] + // https://github.com/torvalds/linux/blob/master/arch/arm64/include/uapi/asm/sigcontext.h + + // Skip fault address + offset += 8; + row->GetCFAValue().SetIsRegisterPlusOffset(arm64_dwarf::sp, offset); + + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x0, 0 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x1, 1 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x2, 2 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x3, 3 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x4, 4 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x5, 5 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x6, 6 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x7, 7 * 8, false); + 
row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x8, 8 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x9, 9 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x10, 10 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x11, 11 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x12, 12 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x13, 13 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x14, 14 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x15, 15 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x16, 16 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x17, 17 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x18, 18 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x19, 19 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x20, 20 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x21, 21 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x22, 22 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x23, 23 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x24, 24 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x25, 25 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x26, 26 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x27, 27 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x28, 28 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::fp, 29 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::x30, 30 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::sp, 31 * 8, false); + row->SetRegisterLocationToAtCFAPlusOffset(arm64_dwarf::pc, 32 * 8, false); + + // The sigcontext may also contain floating point and 
SVE registers. + // However this would require a dynamic unwind plan so they are not included + // here. + + unwind_plan_sp = std::make_shared(eRegisterKindDWARF); + unwind_plan_sp->AppendRow(row); + unwind_plan_sp->SetSourceName("AArch64 AIX sigcontext"); + unwind_plan_sp->SetSourcedFromCompiler(eLazyBoolYes); + // Because sp is the same throughout the function + unwind_plan_sp->SetUnwindPlanValidAtAllInstructions(eLazyBoolYes); + unwind_plan_sp->SetUnwindPlanForSignalTrap(eLazyBoolYes); + + return unwind_plan_sp; +} + +lldb::UnwindPlanSP +PlatformAIX::GetTrapHandlerUnwindPlan(const llvm::Triple &triple, + ConstString name) { + if (triple.isAArch64()) + return GetAArch64TrapHanlderUnwindPlan(name); + + return {}; +} + +MmapArgList PlatformAIX::GetMmapArgumentList(const ArchSpec &arch, + addr_t addr, addr_t length, + unsigned prot, unsigned flags, + addr_t fd, addr_t offset) { +#if defined(__AIX__) + unsigned flags_platform = MAP_VARIABLE | MAP_PRIVATE | MAP_ANONYMOUS; +#else + unsigned flags_platform = 0; +#endif + MmapArgList args({addr, length, prot, flags_platform, fd, offset}); + return args; +} + +CompilerType PlatformAIX::GetSiginfoType(const llvm::Triple &triple) { + if (!m_type_system_up) + m_type_system_up.reset(new TypeSystemClang("siginfo", triple)); + TypeSystemClang *ast = m_type_system_up.get(); + + bool si_errno_then_code = true; + + switch (triple.getArch()) { + case llvm::Triple::mips: + case llvm::Triple::mipsel: + case llvm::Triple::mips64: + case llvm::Triple::mips64el: + // mips has si_code and si_errno swapped + si_errno_then_code = false; + break; + default: + break; + } + + // generic types + CompilerType int_type = ast->GetBasicType(eBasicTypeInt); + CompilerType uint_type = ast->GetBasicType(eBasicTypeUnsignedInt); + CompilerType short_type = ast->GetBasicType(eBasicTypeShort); + CompilerType long_type = ast->GetBasicType(eBasicTypeLong); + CompilerType voidp_type = ast->GetBasicType(eBasicTypeVoid).GetPointerType(); + + // 
platform-specific types + CompilerType &pid_type = int_type; + CompilerType &uid_type = uint_type; + CompilerType &clock_type = long_type; + CompilerType &band_type = long_type; + + CompilerType sigval_type = ast->CreateRecordType( + nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "__lldb_sigval_t", + llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC); + ast->StartTagDeclarationDefinition(sigval_type); + ast->AddFieldToRecordType(sigval_type, "sival_int", int_type, + lldb::eAccessPublic, 0); + ast->AddFieldToRecordType(sigval_type, "sival_ptr", voidp_type, + lldb::eAccessPublic, 0); + ast->CompleteTagDeclarationDefinition(sigval_type); + + CompilerType sigfault_bounds_type = ast->CreateRecordType( + nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "", + llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC); + ast->StartTagDeclarationDefinition(sigfault_bounds_type); + ast->AddFieldToRecordType(sigfault_bounds_type, "_addr_bnd", + ast->CreateStructForIdentifier(ConstString(), + { + {"_lower", voidp_type}, + {"_upper", voidp_type}, + }), + lldb::eAccessPublic, 0); + ast->AddFieldToRecordType(sigfault_bounds_type, "_pkey", uint_type, + lldb::eAccessPublic, 0); + ast->CompleteTagDeclarationDefinition(sigfault_bounds_type); + + // siginfo_t + CompilerType siginfo_type = ast->CreateRecordType( + nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "__lldb_siginfo_t", + llvm::to_underlying(clang::TagTypeKind::Struct), lldb::eLanguageTypeC); + ast->StartTagDeclarationDefinition(siginfo_type); + ast->AddFieldToRecordType(siginfo_type, "si_signo", int_type, + lldb::eAccessPublic, 0); + + if (si_errno_then_code) { + ast->AddFieldToRecordType(siginfo_type, "si_errno", int_type, + lldb::eAccessPublic, 0); + ast->AddFieldToRecordType(siginfo_type, "si_code", int_type, + lldb::eAccessPublic, 0); + } else { + ast->AddFieldToRecordType(siginfo_type, "si_code", int_type, + lldb::eAccessPublic, 0); + 
ast->AddFieldToRecordType(siginfo_type, "si_errno", int_type, + lldb::eAccessPublic, 0); + } + + // the structure is padded on 64-bit arches to fix alignment + if (triple.isArch64Bit()) + ast->AddFieldToRecordType(siginfo_type, "__pad0", int_type, + lldb::eAccessPublic, 0); + + // union used to hold the signal data + CompilerType union_type = ast->CreateRecordType( + nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "", + llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC); + ast->StartTagDeclarationDefinition(union_type); + + ast->AddFieldToRecordType( + union_type, "_kill", + ast->CreateStructForIdentifier(ConstString(), + { + {"si_pid", pid_type}, + {"si_uid", uid_type}, + }), + lldb::eAccessPublic, 0); + + ast->AddFieldToRecordType( + union_type, "_timer", + ast->CreateStructForIdentifier(ConstString(), + { + {"si_tid", int_type}, + {"si_overrun", int_type}, + {"si_sigval", sigval_type}, + }), + lldb::eAccessPublic, 0); + + ast->AddFieldToRecordType( + union_type, "_rt", + ast->CreateStructForIdentifier(ConstString(), + { + {"si_pid", pid_type}, + {"si_uid", uid_type}, + {"si_sigval", sigval_type}, + }), + lldb::eAccessPublic, 0); + + ast->AddFieldToRecordType( + union_type, "_sigchld", + ast->CreateStructForIdentifier(ConstString(), + { + {"si_pid", pid_type}, + {"si_uid", uid_type}, + {"si_status", int_type}, + {"si_utime", clock_type}, + {"si_stime", clock_type}, + }), + lldb::eAccessPublic, 0); + + ast->AddFieldToRecordType( + union_type, "_sigfault", + ast->CreateStructForIdentifier(ConstString(), + { + {"si_addr", voidp_type}, + {"si_addr_lsb", short_type}, + {"_bounds", sigfault_bounds_type}, + }), + lldb::eAccessPublic, 0); + + ast->AddFieldToRecordType( + union_type, "_sigpoll", + ast->CreateStructForIdentifier(ConstString(), + { + {"si_band", band_type}, + {"si_fd", int_type}, + }), + lldb::eAccessPublic, 0); + + // NB: SIGSYS is not present on ia64 but we don't seem to support that + ast->AddFieldToRecordType( + union_type, 
"_sigsys", + ast->CreateStructForIdentifier(ConstString(), + { + {"_call_addr", voidp_type}, + {"_syscall", int_type}, + {"_arch", uint_type}, + }), + lldb::eAccessPublic, 0); + + ast->CompleteTagDeclarationDefinition(union_type); + ast->AddFieldToRecordType(siginfo_type, "_sifields", union_type, + lldb::eAccessPublic, 0); + + ast->CompleteTagDeclarationDefinition(siginfo_type); + return siginfo_type; +} diff --git a/lldb/source/Plugins/Platform/AIX/PlatformAIX.h b/lldb/source/Plugins/Platform/AIX/PlatformAIX.h new file mode 100644 index 0000000000000..3ae8089a48d71 --- /dev/null +++ b/lldb/source/Plugins/Platform/AIX/PlatformAIX.h @@ -0,0 +1,74 @@ +//===-- PlatformAIX.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_PLATFORM_AIX_PLATFORMAIX_H +#define LLDB_SOURCE_PLUGINS_PLATFORM_AIX_PLATFORMAIX_H + +#include "Plugins/Platform/POSIX/PlatformPOSIX.h" +#include "Plugins/TypeSystem/Clang/TypeSystemClang.h" + +namespace lldb_private { +namespace platform_aix { + +class PlatformAIX : public PlatformPOSIX { +public: + PlatformAIX(bool is_host); + + static void Initialize(); + + static void Terminate(); + + // lldb_private::PluginInterface functions + static lldb::PlatformSP CreateInstance(bool force, const ArchSpec *arch); + + static llvm::StringRef GetPluginNameStatic(bool is_host) { + return is_host ? 
Platform::GetHostPlatformName() : "remote-AIX"; + } + + static llvm::StringRef GetPluginDescriptionStatic(bool is_host); + + llvm::StringRef GetPluginName() override { + return GetPluginNameStatic(IsHost()); + } + + // lldb_private::Platform functions + llvm::StringRef GetDescription() override { + return GetPluginDescriptionStatic(IsHost()); + } + + void GetStatus(Stream &strm) override; + + std::vector + GetSupportedArchitectures(const ArchSpec &process_host_arch) override; + + uint32_t GetResumeCountForLaunchInfo(ProcessLaunchInfo &launch_info) override; + + bool CanDebugProcess() override; + + void CalculateTrapHandlerSymbolNames() override; + + lldb::UnwindPlanSP GetTrapHandlerUnwindPlan(const llvm::Triple &triple, + ConstString name) override; + + MmapArgList GetMmapArgumentList(const ArchSpec &arch, lldb::addr_t addr, + lldb::addr_t length, unsigned prot, + unsigned flags, lldb::addr_t fd, + lldb::addr_t offset) override; + + CompilerType GetSiginfoType(const llvm::Triple &triple) override; + + std::vector m_supported_architectures; + +private: + std::unique_ptr m_type_system_up; +}; + +} // namespace platform_AIX +} // namespace lldb_private + +#endif // LLDB_SOURCE_PLUGINS_PLATFORM_AIX_PLATFORMAIX_H diff --git a/lldb/source/Plugins/Platform/CMakeLists.txt b/lldb/source/Plugins/Platform/CMakeLists.txt index 6869587f917eb..9d0afd97cff85 100644 --- a/lldb/source/Plugins/Platform/CMakeLists.txt +++ b/lldb/source/Plugins/Platform/CMakeLists.txt @@ -8,3 +8,4 @@ add_subdirectory(OpenBSD) add_subdirectory(POSIX) add_subdirectory(QemuUser) add_subdirectory(Windows) +add_subdirectory(AIX) diff --git a/lldb/source/Plugins/Process/AIX/CMakeLists.txt b/lldb/source/Plugins/Process/AIX/CMakeLists.txt new file mode 100644 index 0000000000000..e9d83266f5857 --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/CMakeLists.txt @@ -0,0 +1,19 @@ +add_definitions("-D_ALL_SOURCE") + +add_lldb_library(lldbPluginProcessAIX + NativeProcessAIX.cpp + NativeRegisterContextAIX.cpp + 
NativeRegisterContextAIX_ppc64.cpp + NativeThreadAIX.cpp + + LINK_LIBS + lldbCore + lldbHost + lldbSymbol + lldbTarget + lldbUtility + lldbPluginProcessPOSIX + lldbPluginProcessUtility + LINK_COMPONENTS + Support + ) diff --git a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp new file mode 100644 index 0000000000000..882f20d30a3bf --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp @@ -0,0 +1,2048 @@ +//===-- NativeProcessAIX.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NativeProcessAIX.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "NativeThreadAIX.h" +#include "Plugins/Process/POSIX/ProcessPOSIXLog.h" +//#include "Plugins/Process/Utility/LinuxProcMaps.h" +//#include "Procfs.h" +#include "lldb/Core/ModuleSpec.h" +#include "lldb/Host/Host.h" +#include "lldb/Host/HostProcess.h" +#include "lldb/Host/ProcessLaunchInfo.h" +#include "lldb/Host/PseudoTerminal.h" +#include "lldb/Host/ThreadLauncher.h" +#include "lldb/Host/common/NativeRegisterContext.h" +#include "lldb/Host/aix/Ptrace.h" +//#include "lldb/Host/linux/Host.h" +//#include "lldb/Host/linux/Uio.h" +#include "lldb/Host/posix/ProcessLauncherPosixFork.h" +#include "lldb/Symbol/ObjectFile.h" +#include "lldb/Target/Process.h" +#include "lldb/Target/Target.h" +#include "lldb/Utility/LLDBAssert.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/RegisterValue.h" +#include "lldb/Utility/State.h" +#include "lldb/Utility/Status.h" +#include "lldb/Utility/StringExtractor.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/Support/Errno.h" +#include 
"llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Threading.h" + +#include +#include +#include +#include +//#include +#include +#include +#include +#include + +#ifdef __aarch64__ +#include +#include +#endif + +// Support hardware breakpoints in case it has not been defined +#ifndef TRAP_HWBKPT +#define TRAP_HWBKPT 4 +#endif + +#ifndef HWCAP2_MTE +#define HWCAP2_MTE (1 << 18) +#endif + +using namespace lldb; +using namespace lldb_private; +using namespace lldb_private::process_aix; +using namespace llvm; + +// Private bits we only need internally. + +static bool ProcessVmReadvSupported() { + static bool is_supported; + static llvm::once_flag flag; + + llvm::call_once(flag, [] { + Log *log = GetLog(POSIXLog::Process); + + uint32_t source = 0x47424742; + uint32_t dest = 0; + + struct iovec local, remote; + remote.iov_base = &source; + local.iov_base = &dest; + remote.iov_len = local.iov_len = sizeof source; + +#if 0 + // We shall try if cross-process-memory reads work by attempting to read a + // value from our own process. + ssize_t res = process_vm_readv(getpid(), &local, 1, &remote, 1, 0); + is_supported = (res == sizeof(source) && source == dest); + if (is_supported) + LLDB_LOG(log, + "Detected kernel support for process_vm_readv syscall. " + "Fast memory reads enabled."); + else + LLDB_LOG(log, + "syscall process_vm_readv failed (error: {0}). 
Fast memory " + "reads disabled.", + llvm::sys::StrError()); +#endif + }); + + return is_supported; +} + +static void MaybeLogLaunchInfo(const ProcessLaunchInfo &info) { + Log *log = GetLog(POSIXLog::Process); + if (!log) + return; + + if (const FileAction *action = info.GetFileActionForFD(STDIN_FILENO)) + LLDB_LOG(log, "setting STDIN to '{0}'", action->GetFileSpec()); + else + LLDB_LOG(log, "leaving STDIN as is"); + + if (const FileAction *action = info.GetFileActionForFD(STDOUT_FILENO)) + LLDB_LOG(log, "setting STDOUT to '{0}'", action->GetFileSpec()); + else + LLDB_LOG(log, "leaving STDOUT as is"); + + if (const FileAction *action = info.GetFileActionForFD(STDERR_FILENO)) + LLDB_LOG(log, "setting STDERR to '{0}'", action->GetFileSpec()); + else + LLDB_LOG(log, "leaving STDERR as is"); + + int i = 0; + for (const char **args = info.GetArguments().GetConstArgumentVector(); *args; + ++args, ++i) + LLDB_LOG(log, "arg {0}: '{1}'", i, *args); +} + +static void DisplayBytes(StreamString &s, void *bytes, uint32_t count) { + uint8_t *ptr = (uint8_t *)bytes; + const uint32_t loop_count = std::min(DEBUG_PTRACE_MAXBYTES, count); + for (uint32_t i = 0; i < loop_count; i++) { + s.Printf("[%x]", *ptr); + ptr++; + } +} + +static void PtraceDisplayBytes(int &req, void *data, size_t data_size) { + Log *log = GetLog(POSIXLog::Ptrace); + if (!log) + return; + StreamString buf; + + switch (req) { + case PTRACE_POKETEXT: { + DisplayBytes(buf, &data, 8); + LLDB_LOGV(log, "PTRACE_POKETEXT {0}", buf.GetData()); + break; + } + case PTRACE_POKEDATA: { + DisplayBytes(buf, &data, 8); + LLDB_LOGV(log, "PTRACE_POKEDATA {0}", buf.GetData()); + break; + } + case PTRACE_POKEUSER: { + DisplayBytes(buf, &data, 8); + LLDB_LOGV(log, "PTRACE_POKEUSER {0}", buf.GetData()); + break; + } + case PTRACE_SETREGS: { + DisplayBytes(buf, data, data_size); + LLDB_LOGV(log, "PTRACE_SETREGS {0}", buf.GetData()); + break; + } + case PTRACE_SETFPREGS: { + DisplayBytes(buf, data, data_size); + LLDB_LOGV(log, 
"PTRACE_SETFPREGS {0}", buf.GetData()); + break; + } +#if 0 + case PTRACE_SETSIGINFO: { + DisplayBytes(buf, data, sizeof(siginfo_t)); + LLDB_LOGV(log, "PTRACE_SETSIGINFO {0}", buf.GetData()); + break; + } +#endif + case PTRACE_SETREGSET: { + // Extract iov_base from data, which is a pointer to the struct iovec + DisplayBytes(buf, *(void **)data, data_size); + LLDB_LOGV(log, "PTRACE_SETREGSET {0}", buf.GetData()); + break; + } + default: {} + } +} + +static constexpr unsigned k_ptrace_word_size = sizeof(void *); +static_assert(sizeof(long) >= k_ptrace_word_size, + "Size of long must be larger than ptrace word size"); + +// Simple helper function to ensure flags are enabled on the given file +// descriptor. +static Status EnsureFDFlags(int fd, int flags) { + Status error; + + int status = fcntl(fd, F_GETFL); + if (status == -1) { + error.SetErrorToErrno(); + return error; + } + + if (fcntl(fd, F_SETFL, status | flags) == -1) { + error.SetErrorToErrno(); + return error; + } + + return error; +} + +#if 0 +static llvm::Error AddPtraceScopeNote(llvm::Error original_error) { + Expected ptrace_scope = GetPtraceScope(); + if (auto E = ptrace_scope.takeError()) { + Log *log = GetLog(POSIXLog::Process); + LLDB_LOG(log, "error reading value of ptrace_scope: {0}", E); + + // The original error is probably more interesting than not being able to + // read or interpret ptrace_scope. + return original_error; + } + + // We only have suggestions to provide for 1-3. + switch (*ptrace_scope) { + case 1: + case 2: + return llvm::createStringError( + std::error_code(errno, std::generic_category()), + "The current value of ptrace_scope is %d, which can cause ptrace to " + "fail to attach to a running process. 
To fix this, run:\n" + "\tsudo sysctl -w kernel.yama.ptrace_scope=0\n" + "For more information, see: " + "https://www.kernel.org/doc/Documentation/security/Yama.txt.", + *ptrace_scope); + case 3: + return llvm::createStringError( + std::error_code(errno, std::generic_category()), + "The current value of ptrace_scope is 3, which will cause ptrace to " + "fail to attach to a running process. This value cannot be changed " + "without rebooting.\n" + "For more information, see: " + "https://www.kernel.org/doc/Documentation/security/Yama.txt."); + case 0: + default: + return original_error; + } +} +#endif + +NativeProcessAIX::Manager::Manager(MainLoop &mainloop) + : NativeProcessProtocol::Manager(mainloop) { + Status status; + m_sigchld_handle = mainloop.RegisterSignal( + SIGCHLD, [this](MainLoopBase &) { SigchldHandler(); }, status); + assert(m_sigchld_handle && status.Success()); +} + +// Public Static Methods + +llvm::Expected> +NativeProcessAIX::Manager::Launch(ProcessLaunchInfo &launch_info, + NativeDelegate &native_delegate) { + Log *log = GetLog(POSIXLog::Process); + + MaybeLogLaunchInfo(launch_info); + + Status status; + ::pid_t pid = ProcessLauncherPosixFork() + .LaunchProcess(launch_info, status) + .GetProcessId(); + LLDB_LOG(log, "pid = {0:x}", pid); + if (status.Fail()) { + LLDB_LOG(log, "failed to launch process: {0}", status); + return status.ToError(); + } + + // Wait for the child process to trap on its call to execve. 
+ int wstatus = 0; + ::pid_t wpid = llvm::sys::RetryAfterSignal(-1, ::waitpid, pid, &wstatus, 0); + assert(wpid == pid); + UNUSED_IF_ASSERT_DISABLED(wpid); + if (!WIFSTOPPED(wstatus)) { + LLDB_LOG(log, "Could not sync with inferior process: wstatus={0}", + WaitStatus::Decode(wstatus)); + return llvm::make_error("Could not sync with inferior process", + llvm::inconvertibleErrorCode()); + } + LLDB_LOG(log, "inferior started, now in stopped state"); + + ProcessInstanceInfo Info; + if (!Host::GetProcessInfo(pid, Info)) { + return llvm::make_error("Cannot get process architecture", + llvm::inconvertibleErrorCode()); + } + /*llvm::Expected arch_or = + NativeRegisterContextAIX::DetermineArchitecture(pid); + if (!arch_or) + return arch_or.takeError();*/ + + // Set the architecture to the exe architecture. + LLDB_LOG(log, "pid = {0}, detected architecture {1}", pid, + Info.GetArchitecture().GetArchitectureName()); + + return std::unique_ptr(new NativeProcessAIX( + pid, launch_info.GetPTY().ReleasePrimaryFileDescriptor(), native_delegate, + Info.GetArchitecture(), *this, {pid})); +} + +llvm::Expected> +NativeProcessAIX::Manager::Attach( + lldb::pid_t pid, NativeProcessProtocol::NativeDelegate &native_delegate) { + Log *log = GetLog(POSIXLog::Process); + LLDB_LOG(log, "pid = {0:x}", pid); + + ProcessInstanceInfo Info; + if (!Host::GetProcessInfo(pid, Info)) { + return llvm::make_error("Cannot get process architecture", + llvm::inconvertibleErrorCode()); + } + auto tids_or = NativeProcessAIX::Attach(pid); + if (!tids_or) + return tids_or.takeError(); +#if 0 + ArrayRef<::pid_t> tids = *tids_or; + llvm::Expected arch_or = + NativeRegisterContextAIX::DetermineArchitecture(tids[0]); + if (!arch_or) + return arch_or.takeError(); +#endif + + return std::unique_ptr( + new NativeProcessAIX(pid, -1, native_delegate, Info.GetArchitecture(), *this, *tids_or)); +} + +lldb::addr_t NativeProcessAIX::GetSharedLibraryInfoAddress() { + // punt on this for now + return LLDB_INVALID_ADDRESS; +}
+ +NativeProcessAIX::Extension +NativeProcessAIX::Manager::GetSupportedExtensions() const { + NativeProcessAIX::Extension supported = + Extension::multiprocess | Extension::fork | Extension::vfork | + Extension::pass_signals | Extension::auxv | Extension::libraries_svr4 | + Extension::siginfo_read; + +#ifdef __aarch64__ + // At this point we do not have a process so read auxv directly. + if ((getauxval(AT_HWCAP2) & HWCAP2_MTE)) + supported |= Extension::memory_tagging; +#endif + + return supported; +} + +static std::optional> WaitPid() { + Log *log = GetLog(POSIXLog::Process); + + int status; + ::pid_t wait_pid = llvm::sys::RetryAfterSignal( + -1, ::waitpid, -1, &status, /*__WALL | __WNOTHREAD |*/ WNOHANG); + + if (wait_pid == 0) + return std::nullopt; + + if (wait_pid == -1) { + Status error(errno, eErrorTypePOSIX); + LLDB_LOG(log, "waitpid(-1, &status, _) failed: {0}", error); + return std::nullopt; + } + + WaitStatus wait_status = WaitStatus::Decode(status); + + LLDB_LOG(log, "waitpid(-1, &status, _) = {0}, status = {1}", wait_pid, + wait_status); + return std::make_pair(wait_pid, wait_status); +} + +void NativeProcessAIX::Manager::SigchldHandler() { + Log *log = GetLog(POSIXLog::Process); + while (true) { + auto wait_result = WaitPid(); + if (!wait_result) + return; + lldb::pid_t pid = wait_result->first; + WaitStatus status = wait_result->second; + + // Ask each process whether it wants to handle the event. Each event should + // be handled by exactly one process, but thread creation events require + // special handling. + // Thread creation consists of two events (one on the parent and one on the + // child thread) and they can arrive in any order nondeterministically. The + // parent event carries the information about the child thread, but not + // vice-versa. This means that if the child event arrives first, it may not + // be handled by any process (because it doesn't know the thread belongs to + // it). 
+ bool handled = llvm::any_of(m_processes, [&](NativeProcessAIX *process) { + return process->TryHandleWaitStatus(pid, status); + }); + if (!handled) { + if (status.type == WaitStatus::Stop && status.status == SIGSTOP) { + // Store the thread creation event for later collection. + m_unowned_threads.insert(pid); + } else { + LLDB_LOG(log, "Ignoring waitpid event {0} for pid {1}", status, pid); + } + } + } +} + +void NativeProcessAIX::Manager::CollectThread(::pid_t tid) { + Log *log = GetLog(POSIXLog::Process); + + if (m_unowned_threads.erase(tid)) + return; // We've encountered this thread already. + + // The TID is not tracked yet, let's wait for it to appear. + int status = -1; + LLDB_LOG(log, + "received clone event for tid {0}. tid not tracked yet, " + "waiting for it to appear...", + tid); + ::pid_t wait_pid = + llvm::sys::RetryAfterSignal(-1, ::waitpid, tid, &status, P_ALL/*__WALL*/); + + // It's theoretically possible to get other events if the entire process was + // SIGKILLed before we got a chance to check this. In that case, we'll just + // clean everything up when we get the process exit event. + + LLDB_LOG(log, + "waitpid({0}, &status, __WALL) => {1} (errno: {2}, status = {3})", + tid, wait_pid, errno, WaitStatus::Decode(status)); +} + +// Public Instance Methods + +NativeProcessAIX::NativeProcessAIX(::pid_t pid, int terminal_fd, + NativeDelegate &delegate, + const ArchSpec &arch, Manager &manager, + llvm::ArrayRef<::pid_t> tids) + : NativeProcessProtocol(pid, terminal_fd, delegate), m_manager(manager), + m_arch(arch) { + manager.AddProcess(*this); + if (m_terminal_fd != -1) { + Status status = EnsureFDFlags(m_terminal_fd, O_NONBLOCK); + assert(status.Success()); + } + + for (const auto &tid : tids) { + NativeThreadAIX &thread = AddThread(tid, /*resume*/ false); + ThreadWasCreated(thread); + } + + // Let our process instance know the thread has stopped. 
+ SetCurrentThreadID(tids[0]); + SetState(StateType::eStateStopped, false); +} + +llvm::Expected> NativeProcessAIX::Attach(::pid_t pid) { + Log *log = GetLog(POSIXLog::Process); + + Status status; + if ((status = PtraceWrapper(PT_ATTACH, pid)).Fail()) { + return status.ToError(); + } + + int wpid = + llvm::sys::RetryAfterSignal(-1, ::waitpid, pid, nullptr, WNOHANG); + if (wpid <= 0) { + return llvm::errorCodeToError( + std::error_code(errno, std::generic_category())); + } + + LLDB_LOG(log, "adding pid = {0}", pid); + + std::vector<::pid_t> tids; + tids.push_back(pid); + return std::move(tids); +} + +bool NativeProcessAIX::TryHandleWaitStatus(lldb::pid_t pid, + WaitStatus status) { + if (pid == GetID() && + (status.type == WaitStatus::Exit || status.type == WaitStatus::Signal)) { + // The process exited. We're done monitoring. Report to delegate. + SetExitStatus(status, true); + return true; + } + if (NativeThreadAIX *thread = GetThreadByID(pid)) { + MonitorCallback(*thread, status); + return true; + } + return false; +} + +// Handles all waitpid events from the inferior process. +void NativeProcessAIX::MonitorCallback(NativeThreadAIX &thread, + WaitStatus status) { + Log *log = GetLog(LLDBLog::Process); + + // Certain activities differ based on whether the pid is the tid of the main + // thread. + const bool is_main_thread = (thread.GetID() == GetID()); + + // Handle when the thread exits. + if (status.type == WaitStatus::Exit || status.type == WaitStatus::Signal) { + LLDB_LOG(log, + "got exit status({0}) , tid = {1} ({2} main thread), process " + "state = {3}", + status, thread.GetID(), is_main_thread ? "is" : "is not", + GetState()); + + // This is a thread that exited. Ensure we're not tracking it anymore. + StopTrackingThread(thread); + + assert(!is_main_thread && "Main thread exits handled elsewhere"); + return; + } + + int8_t signo = GetSignalInfo(status); + + // Get details on the signal raised. + if (signo) { + // We have retrieved the signal info. 
Dispatch appropriately. + if (signo == SIGTRAP) + MonitorSIGTRAP(status, thread); + else + MonitorSignal(status, thread); + } else { + assert(0); + } +} + + +void NativeProcessAIX::MonitorSIGTRAP(const WaitStatus status, + NativeThreadAIX &thread) { + Log *log = GetLog(POSIXLog::Process); + const bool is_main_thread = (thread.GetID() == GetID()); + + NativeRegisterContextAIX ®_ctx = thread.GetRegisterContext(); + const RegisterInfo *pc_info = reg_ctx.GetRegisterInfoByName("pc", 0); + RegisterValue pc_value; + + switch (status.status) { + case SIGTRAP: + // Determine the source of SIGTRAP by checking current instruction: + // if that is trap instruction, then this is breakpoint, otherwise + // this is watchpoint. + reg_ctx.ReadRegister(pc_info, pc_value); + + MonitorBreakpoint(thread); + break; + default: + LLDB_LOG(log, "received unknown SIGTRAP stop event ({0}, pid {1} tid {2}", + status.status, GetID(), thread.GetID()); + MonitorSignal(status, thread); + break; + } +} + +void NativeProcessAIX::MonitorTrace(NativeThreadAIX &thread) { + Log *log = GetLog(POSIXLog::Process); + LLDB_LOG(log, "received trace event, pid = {0}", thread.GetID()); + + // This thread is currently stopped. + thread.SetStoppedByTrace(); + + StopRunningThreads(thread.GetID()); +} + +void NativeProcessAIX::MonitorBreakpoint(NativeThreadAIX &thread) { + Log *log = GetLog(LLDBLog::Process | LLDBLog::Breakpoints); + LLDB_LOG(log, "received breakpoint event, pid = {0}", thread.GetID()); + + // Mark the thread as stopped at breakpoint. 
+ thread.SetStoppedByBreakpoint(); + FixupBreakpointPCAsNeeded(thread); + + if (m_threads_stepping_with_breakpoint.find(thread.GetID()) != + m_threads_stepping_with_breakpoint.end()) + thread.SetStoppedByTrace(); + + StopRunningThreads(thread.GetID()); +} + +void NativeProcessAIX::MonitorWatchpoint(NativeThreadAIX &thread, + uint32_t wp_index) { + Log *log = GetLog(LLDBLog::Process | LLDBLog::Watchpoints); + LLDB_LOG(log, "received watchpoint event, pid = {0}, wp_index = {1}", + thread.GetID(), wp_index); + + // Mark the thread as stopped at watchpoint. The address is at + // (lldb::addr_t)info->si_addr if we need it. + thread.SetStoppedByWatchpoint(wp_index); + + // We need to tell all other running threads before we notify the delegate + // about this stop. + StopRunningThreads(thread.GetID()); +} + +void NativeProcessAIX::MonitorSignal(const WaitStatus status, + NativeThreadAIX &thread) { + int8_t signo = GetSignalInfo(status); +#if 0 + const bool is_from_llgs = info.si_pid == getpid(); +#endif + + Log *log = GetLog(POSIXLog::Process); + + // POSIX says that process behaviour is undefined after it ignores a SIGFPE, + // SIGILL, SIGSEGV, or SIGBUS *unless* that signal was generated by a kill(2) + // or raise(3). Similarly for tgkill(2) on AIX. + // + // IOW, user generated signals never generate what we consider to be a + // "crash". + // + // Similarly, ACK signals generated by this monitor. + + // Handle the signal. + LLDB_LOG(log, + "received signal {0} ({1}) with code NA, (siginfo pid = {2}, " + "waitpid pid = {3})", + Host::GetSignalAsCString(signo), signo, thread.GetID(), GetID()); + +#if 0 + // Check for thread stop notification. + // FIXME + if (is_from_llgs /*&& (info.si_code == SI_TKILL)*/ && (signo == SIGSTOP)) { + // This is a tgkill()-based stop. + LLDB_LOG(log, "pid {0} tid {1}, thread stopped", GetID(), thread.GetID()); + + // Check that we're not already marked with a stop reason. 
Note this thread + // really shouldn't already be marked as stopped - if we were, that would + // imply that the kernel signaled us with the thread stopping which we + // handled and marked as stopped, and that, without an intervening resume, + // we received another stop. It is more likely that we are missing the + // marking of a run state somewhere if we find that the thread was marked + // as stopped. + const StateType thread_state = thread.GetState(); + if (!StateIsStoppedState(thread_state, false)) { + // An inferior thread has stopped because of a SIGSTOP we have sent it. + // Generally, these are not important stops and we don't want to report + // them as they are just used to stop other threads when one thread (the + // one with the *real* stop reason) hits a breakpoint (watchpoint, + // etc...). However, in the case of an asynchronous Interrupt(), this + // *is* the real stop reason, so we leave the signal intact if this is + // the thread that was chosen as the triggering thread. + if (m_pending_notification_tid != LLDB_INVALID_THREAD_ID) { + if (m_pending_notification_tid == thread.GetID()) + thread.SetStoppedBySignal(SIGSTOP, &info); + else + thread.SetStoppedWithNoReason(); + + SetCurrentThreadID(thread.GetID()); + SignalIfAllThreadsStopped(); + } else { + // We can end up here if stop was initiated by LLGS but by this time a + // thread stop has occurred - maybe initiated by another event. + Status error = ResumeThread(thread, thread.GetState(), 0); + if (error.Fail()) + LLDB_LOG(log, "failed to resume thread {0}: {1}", thread.GetID(), + error); + } + } else { + LLDB_LOG(log, + "pid {0} tid {1}, thread was already marked as a stopped " + "state (state={2}), leaving stop signal as is", + GetID(), thread.GetID(), thread_state); + SignalIfAllThreadsStopped(); + } + + // Done handling. + return; + } +#endif + + // Check if debugger should stop at this signal or just ignore it and resume + // the inferior. 
+ if (m_signals_to_ignore.contains(signo) || signo == SIGCHLD) { + ResumeThread(thread, thread.GetState(), signo); + return; + } + + // This thread is stopped. + LLDB_LOG(log, "received signal {0}", Host::GetSignalAsCString(signo)); + thread.SetStoppedBySignal(signo); + + // Send a stop to the debugger after we get all other threads to stop. + StopRunningThreads(thread.GetID()); +} + +bool NativeProcessAIX::MonitorClone(NativeThreadAIX &parent, + lldb::pid_t child_pid, int event) { + Log *log = GetLog(POSIXLog::Process); + LLDB_LOG(log, "parent_tid={0}, child_pid={1}, event={2}", parent.GetID(), + child_pid, event); + + // WaitForCloneNotification(child_pid); + + switch (event) { +#if 0 + case PTRACE_EVENT_CLONE: { + // PTRACE_EVENT_CLONE can either mean a new thread or a new process. + // Try to grab the new process' PGID to figure out which one it is. + // If PGID is the same as the PID, then it's a new process. Otherwise, + // it's a thread. + auto tgid_ret = getPIDForTID(child_pid); + if (tgid_ret != child_pid) { + // A new thread should have PGID matching our process' PID. + assert(!tgid_ret || tgid_ret.getValue() == GetID()); + + NativeThreadAIX &child_thread = AddThread(child_pid, /*resume*/ true); + ThreadWasCreated(child_thread); + + // Resume the parent. + ResumeThread(parent, parent.GetState(), LLDB_INVALID_SIGNAL_NUMBER); + break; + } + } + LLVM_FALLTHROUGH; + case PTRACE_EVENT_FORK: + case PTRACE_EVENT_VFORK: { + bool is_vfork = event == PTRACE_EVENT_VFORK; + std::unique_ptr child_process{new NativeProcessAIX( + static_cast<::pid_t>(child_pid), m_terminal_fd, m_delegate, m_arch, + m_main_loop, {static_cast<::pid_t>(child_pid)})}; + if (!is_vfork) + child_process->m_software_breakpoints = m_software_breakpoints; + + Extension expected_ext = is_vfork ? 
Extension::vfork : Extension::fork; + if (bool(m_enabled_extensions & expected_ext)) { + m_delegate.NewSubprocess(this, std::move(child_process)); + // NB: non-vfork clone() is reported as fork + parent.SetStoppedByFork(is_vfork, child_pid); + StopRunningThreads(parent.GetID()); + } else { + child_process->Detach(); + ResumeThread(parent, parent.GetState(), LLDB_INVALID_SIGNAL_NUMBER); + } + break; + } +#endif + default: + llvm_unreachable("unknown clone_info.event"); + } + + return true; +} + +bool NativeProcessAIX::SupportHardwareSingleStepping() const { + return false; +} + +Status NativeProcessAIX::Resume(const ResumeActionList &resume_actions) { + Log *log = GetLog(POSIXLog::Process); + LLDB_LOG(log, "pid {0}", GetID()); + + bool software_single_step = !SupportHardwareSingleStepping(); + + if (software_single_step) { + for (const auto &thread : m_threads) { + assert(thread && "thread list should not contain NULL threads"); + + const ResumeAction *const action = + resume_actions.GetActionForThread(thread->GetID(), true); + if (action == nullptr) + continue; + + if (action->state == eStateStepping) { + Status error = SetupSoftwareSingleStepping( + static_cast(*thread)); + if (error.Fail()) + return error; + } + } + } + + for (const auto &thread : m_threads) { + assert(thread && "thread list should not contain NULL threads"); + + const ResumeAction *const action = + resume_actions.GetActionForThread(thread->GetID(), true); + + if (action == nullptr) { + LLDB_LOG(log, "no action specified for pid {0} tid {1}", GetID(), + thread->GetID()); + continue; + } + + LLDB_LOG(log, "processing resume action state {0} for pid {1} tid {2}", + action->state, GetID(), thread->GetID()); + + switch (action->state) { + case eStateRunning: + case eStateStepping: { + // Run the thread, possibly feeding it the signal. 
+ const int signo = action->signal; + Status error = ResumeThread(static_cast(*thread), + action->state, signo); + if (error.Fail()) + return Status("NativeProcessAIX::%s: failed to resume thread " + "for pid %" PRIu64 ", tid %" PRIu64 ", error = %s", + __FUNCTION__, GetID(), thread->GetID(), + error.AsCString()); + + break; + } + + case eStateSuspended: + case eStateStopped: + break; + + default: + return Status("NativeProcessAIX::%s (): unexpected state %s specified " + "for pid %" PRIu64 ", tid %" PRIu64, + __FUNCTION__, StateAsCString(action->state), GetID(), + thread->GetID()); + } + } + + return Status(); +} + +Status NativeProcessAIX::Halt() { + Status error; + + if (kill(GetID(), SIGSTOP) != 0) + error.SetErrorToErrno(); + + return error; +} + +Status NativeProcessAIX::Detach() { + Status error; + + // Tell ptrace to detach from the process. + if (GetID() == LLDB_INVALID_PROCESS_ID) + return error; + + // Cancel out any SIGSTOPs we may have sent while stopping the process. + // Otherwise, the process may stop as soon as we detach from it. + kill(GetID(), SIGCONT); + + for (const auto &thread : m_threads) { + Status e = Detach(thread->GetID()); + if (e.Fail()) + error = + e; // Save the error, but still attempt to detach from other threads. + } + + return error; +} + +Status NativeProcessAIX::Signal(int signo) { + Status error; + + Log *log = GetLog(POSIXLog::Process); + LLDB_LOG(log, "sending signal {0} ({1}) to pid {2}", signo, + Host::GetSignalAsCString(signo), GetID()); + + if (kill(GetID(), signo)) + error.SetErrorToErrno(); + + return error; +} + +Status NativeProcessAIX::Interrupt() { + // Pick a running thread (or if none, a not-dead stopped thread) as the + // chosen thread that will be the stop-reason thread. 
+ Log *log = GetLog(POSIXLog::Process); + + NativeThreadProtocol *running_thread = nullptr; + NativeThreadProtocol *stopped_thread = nullptr; + + LLDB_LOG(log, "selecting running thread for interrupt target"); + for (const auto &thread : m_threads) { + // If we have a running or stepping thread, we'll call that the target of + // the interrupt. + const auto thread_state = thread->GetState(); + if (thread_state == eStateRunning || thread_state == eStateStepping) { + running_thread = thread.get(); + break; + } else if (!stopped_thread && StateIsStoppedState(thread_state, true)) { + // Remember the first non-dead stopped thread. We'll use that as a + // backup if there are no running threads. + stopped_thread = thread.get(); + } + } + + if (!running_thread && !stopped_thread) { + Status error("found no running/stepping or live stopped threads as target " + "for interrupt"); + LLDB_LOG(log, "skipping due to error: {0}", error); + + return error; + } + + NativeThreadProtocol *deferred_signal_thread = + running_thread ? running_thread : stopped_thread; + + LLDB_LOG(log, "pid {0} {1} tid {2} chosen for interrupt target", GetID(), + running_thread ? "running" : "stopped", + deferred_signal_thread->GetID()); + + StopRunningThreads(deferred_signal_thread->GetID()); + + return Status(); +} + +Status NativeProcessAIX::Kill() { + Log *log = GetLog(POSIXLog::Process); + LLDB_LOG(log, "pid {0}", GetID()); + + Status error; + + switch (m_state) { + case StateType::eStateInvalid: + case StateType::eStateExited: + case StateType::eStateCrashed: + case StateType::eStateDetached: + case StateType::eStateUnloaded: + // Nothing to do - the process is already dead. 
+ LLDB_LOG(log, "ignored for PID {0} due to current state: {1}", GetID(), + m_state); + return error; + + case StateType::eStateConnected: + case StateType::eStateAttaching: + case StateType::eStateLaunching: + case StateType::eStateStopped: + case StateType::eStateRunning: + case StateType::eStateStepping: + case StateType::eStateSuspended: + // We can try to kill a process in these states. + break; + } + + if (kill(GetID(), SIGKILL) != 0) { + error.SetErrorToErrno(); + return error; + } + + return error; +} + +Status NativeProcessAIX::GetMemoryRegionInfo(lldb::addr_t load_addr, + MemoryRegionInfo &range_info) { + // FIXME review that the final memory region returned extends to the end of + // the virtual address space, + // with no perms if it is not mapped. + + // Use an approach that reads memory regions from /proc/{pid}/maps. Assume + // proc maps entries are in ascending order. + // FIXME assert if we find differently. + + if (m_supports_mem_region == LazyBool::eLazyBoolNo) { + // We're done. + return Status("unsupported"); + } + + Status error = PopulateMemoryRegionCache(); + if (error.Fail()) { + return error; + } + + lldb::addr_t prev_base_address = 0; + + // FIXME start by finding the last region that is <= target address using + // binary search. Data is sorted. + // There can be a ton of regions on pthreads apps with lots of threads. + for (auto it = m_mem_region_cache.begin(); it != m_mem_region_cache.end(); + ++it) { + MemoryRegionInfo &proc_entry_info = it->first; + + // Sanity check assumption that /proc/{pid}/maps entries are ascending. + assert((proc_entry_info.GetRange().GetRangeBase() >= prev_base_address) && + "descending /proc/pid/maps entries detected, unexpected"); + prev_base_address = proc_entry_info.GetRange().GetRangeBase(); + UNUSED_IF_ASSERT_DISABLED(prev_base_address); + + // If the target address comes before this entry, indicate distance to next + // region. 
+ if (load_addr < proc_entry_info.GetRange().GetRangeBase()) { + range_info.GetRange().SetRangeBase(load_addr); + range_info.GetRange().SetByteSize( + proc_entry_info.GetRange().GetRangeBase() - load_addr); + range_info.SetReadable(MemoryRegionInfo::OptionalBool::eNo); + range_info.SetWritable(MemoryRegionInfo::OptionalBool::eNo); + range_info.SetExecutable(MemoryRegionInfo::OptionalBool::eNo); + range_info.SetMapped(MemoryRegionInfo::OptionalBool::eNo); + + return error; + } else if (proc_entry_info.GetRange().Contains(load_addr)) { + // The target address is within the memory region we're processing here. + range_info = proc_entry_info; + return error; + } + + // The target memory address comes somewhere after the region we just + // parsed. + } + + // If we made it here, we didn't find an entry that contained the given + // address. Return the load_addr as start and the amount of bytes between + // load address and the end of the memory as size. + range_info.GetRange().SetRangeBase(load_addr); + range_info.GetRange().SetRangeEnd(LLDB_INVALID_ADDRESS); + range_info.SetReadable(MemoryRegionInfo::OptionalBool::eNo); + range_info.SetWritable(MemoryRegionInfo::OptionalBool::eNo); + range_info.SetExecutable(MemoryRegionInfo::OptionalBool::eNo); + range_info.SetMapped(MemoryRegionInfo::OptionalBool::eNo); + return error; +} + +Status NativeProcessAIX::PopulateMemoryRegionCache() { + Log *log = GetLog(POSIXLog::Process); + + // If our cache is empty, pull the latest. There should always be at least + // one memory region if memory region handling is supported. 
+ if (!m_mem_region_cache.empty()) { + LLDB_LOG(log, "reusing {0} cached memory region entries", + m_mem_region_cache.size()); + return Status(); + } + + Status Result; +#if 0 + AIXMapCallback callback = [&](llvm::Expected Info) { + if (Info) { + FileSpec file_spec(Info->GetName().GetCString()); + FileSystem::Instance().Resolve(file_spec); + m_mem_region_cache.emplace_back(*Info, file_spec); + return true; + } + + Result = Info.takeError(); + m_supports_mem_region = LazyBool::eLazyBoolNo; + LLDB_LOG(log, "failed to parse proc maps: {0}", Result); + return false; + }; + + // AIX kernel since 2.6.14 has /proc/{pid}/smaps + // if CONFIG_PROC_PAGE_MONITOR is enabled + auto BufferOrError = getProcFile(GetID(), GetCurrentThreadID(), "smaps"); + if (BufferOrError) + ParseAIXSMapRegions(BufferOrError.get()->getBuffer(), callback); + else { + BufferOrError = getProcFile(GetID(), GetCurrentThreadID(), "maps"); + if (!BufferOrError) { + m_supports_mem_region = LazyBool::eLazyBoolNo; + return BufferOrError.getError(); + } + + ParseAIXMapRegions(BufferOrError.get()->getBuffer(), callback); + } + + if (Result.Fail()) + return Result; + + if (m_mem_region_cache.empty()) { + // No entries after attempting to read them. This shouldn't happen if + // /proc/{pid}/maps is supported. Assume we don't support map entries via + // procfs. + m_supports_mem_region = LazyBool::eLazyBoolNo; + LLDB_LOG(log, + "failed to find any procfs maps entries, assuming no support " + "for memory region metadata retrieval"); + return Status("not supported"); + } + + LLDB_LOG(log, "read {0} memory region entries from /proc/{1}/maps", + m_mem_region_cache.size(), GetID()); + + // We support memory retrieval, remember that. 
+ m_supports_mem_region = LazyBool::eLazyBoolYes; +#endif + return Status(); +} + +void NativeProcessAIX::DoStopIDBumped(uint32_t newBumpId) { + Log *log = GetLog(POSIXLog::Process); + LLDB_LOG(log, "newBumpId={0}", newBumpId); + LLDB_LOG(log, "clearing {0} entries from memory region cache", + m_mem_region_cache.size()); + m_mem_region_cache.clear(); +} + +llvm::Expected +NativeProcessAIX::Syscall(llvm::ArrayRef args) { + PopulateMemoryRegionCache(); + auto region_it = llvm::find_if(m_mem_region_cache, [](const auto &pair) { + return pair.first.GetExecutable() == MemoryRegionInfo::eYes && + pair.first.GetShared() != MemoryRegionInfo::eYes; + }); + if (region_it == m_mem_region_cache.end()) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "No executable memory region found!"); + + addr_t exe_addr = region_it->first.GetRange().GetRangeBase(); + + NativeThreadAIX &thread = *GetCurrentThread(); + assert(thread.GetState() == eStateStopped); + NativeRegisterContextAIX ®_ctx = thread.GetRegisterContext(); + + NativeRegisterContextAIX::SyscallData syscall_data = + *reg_ctx.GetSyscallData(); + + WritableDataBufferSP registers_sp; + if (llvm::Error Err = reg_ctx.ReadAllRegisterValues(registers_sp).ToError()) + return std::move(Err); + auto restore_regs = llvm::make_scope_exit( + [&] { reg_ctx.WriteAllRegisterValues(registers_sp); }); + + llvm::SmallVector memory(syscall_data.Insn.size()); + size_t bytes_read; + if (llvm::Error Err = + ReadMemory(exe_addr, memory.data(), memory.size(), bytes_read) + .ToError()) { + return std::move(Err); + } + + auto restore_mem = llvm::make_scope_exit( + [&] { WriteMemory(exe_addr, memory.data(), memory.size(), bytes_read); }); + + if (llvm::Error Err = reg_ctx.SetPC(exe_addr).ToError()) + return std::move(Err); + + for (const auto &zip : llvm::zip_first(args, syscall_data.Args)) { + if (llvm::Error Err = + reg_ctx + .WriteRegisterFromUnsigned(std::get<1>(zip), std::get<0>(zip)) + .ToError()) { + return std::move(Err); + } 
+ } + if (llvm::Error Err = WriteMemory(exe_addr, syscall_data.Insn.data(), + syscall_data.Insn.size(), bytes_read) + .ToError()) + return std::move(Err); + + m_mem_region_cache.clear(); + + // With software single stepping the syscall insn buffer must also include a + // trap instruction to stop the process. + int req = SupportHardwareSingleStepping() ? PTRACE_SINGLESTEP : PTRACE_CONT; + if (llvm::Error Err = + PtraceWrapper(req, thread.GetID(), nullptr, nullptr).ToError()) + return std::move(Err); + + //FIXME + int status; + ::pid_t wait_pid = llvm::sys::RetryAfterSignal(-1, ::waitpid, thread.GetID(), + &status, P_ALL/*__WALL*/); + if (wait_pid == -1) { + return llvm::errorCodeToError( + std::error_code(errno, std::generic_category())); + } + assert((unsigned)wait_pid == thread.GetID()); + + uint64_t result = reg_ctx.ReadRegisterAsUnsigned(syscall_data.Result, -ESRCH); + + // Values larger than this are actually negative errno numbers. + uint64_t errno_threshold = + (uint64_t(-1) >> (64 - 8 * m_arch.GetAddressByteSize())) - 0x1000; + if (result > errno_threshold) { + return llvm::errorCodeToError( + std::error_code(-result & 0xfff, std::generic_category())); + } + + return result; +} + +llvm::Expected +NativeProcessAIX::AllocateMemory(size_t size, uint32_t permissions) { + + std::optional mmap_data = + GetCurrentThread()->GetRegisterContext().GetMmapData(); + if (!mmap_data) + return llvm::make_error(); + + unsigned prot = PROT_NONE; + assert((permissions & (ePermissionsReadable | ePermissionsWritable | + ePermissionsExecutable)) == permissions && + "Unknown permission!"); + if (permissions & ePermissionsReadable) + prot |= PROT_READ; + if (permissions & ePermissionsWritable) + prot |= PROT_WRITE; + if (permissions & ePermissionsExecutable) + prot |= PROT_EXEC; + + llvm::Expected Result = + Syscall({mmap_data->SysMmap, 0, size, prot, MAP_ANONYMOUS | MAP_PRIVATE, + uint64_t(-1), 0}); + if (Result) + m_allocated_memory.try_emplace(*Result, size); + return Result; 
+} + +llvm::Error NativeProcessAIX::DeallocateMemory(lldb::addr_t addr) { + std::optional mmap_data = + GetCurrentThread()->GetRegisterContext().GetMmapData(); + if (!mmap_data) + return llvm::make_error(); + + auto it = m_allocated_memory.find(addr); + if (it == m_allocated_memory.end()) + return llvm::createStringError(llvm::errc::invalid_argument, + "Memory not allocated by the debugger."); + + llvm::Expected Result = + Syscall({mmap_data->SysMunmap, addr, it->second}); + if (!Result) + return Result.takeError(); + + m_allocated_memory.erase(it); + return llvm::Error::success(); +} + +Status NativeProcessAIX::ReadMemoryTags(int32_t type, lldb::addr_t addr, + size_t len, + std::vector &tags) { + llvm::Expected details = + GetCurrentThread()->GetRegisterContext().GetMemoryTaggingDetails(type); + if (!details) + return Status(details.takeError()); + + // Ignore 0 length read + if (!len) + return Status(); + + // lldb will align the range it requests but it is not required to by + // the protocol so we'll do it again just in case. + // Remove tag bits too. Ptrace calls may work regardless but that + // is not a guarantee. + MemoryTagManager::TagRange range(details->manager->RemoveTagBits(addr), len); + range = details->manager->ExpandToGranule(range); + + // Allocate enough space for all tags to be read + size_t num_tags = range.GetByteSize() / details->manager->GetGranuleSize(); + tags.resize(num_tags * details->manager->GetTagSizeInBytes()); + + struct iovec tags_iovec; + uint8_t *dest = tags.data(); + lldb::addr_t read_addr = range.GetRangeBase(); + + // This call can return partial data so loop until we error or + // get all tags back. 
+ while (num_tags) { + tags_iovec.iov_base = dest; + tags_iovec.iov_len = num_tags; + + Status error = NativeProcessAIX::PtraceWrapper( + details->ptrace_read_req, GetCurrentThreadID(), + reinterpret_cast(read_addr), static_cast(&tags_iovec), + 0, nullptr); + + if (error.Fail()) { + // Discard partial reads + tags.resize(0); + return error; + } + + size_t tags_read = tags_iovec.iov_len; + assert(tags_read && (tags_read <= num_tags)); + + dest += tags_read * details->manager->GetTagSizeInBytes(); + read_addr += details->manager->GetGranuleSize() * tags_read; + num_tags -= tags_read; + } + + return Status(); +} + +Status NativeProcessAIX::WriteMemoryTags(int32_t type, lldb::addr_t addr, + size_t len, + const std::vector &tags) { + llvm::Expected details = + GetCurrentThread()->GetRegisterContext().GetMemoryTaggingDetails(type); + if (!details) + return Status(details.takeError()); + + // Ignore 0 length write + if (!len) + return Status(); + + // lldb will align the range it requests but it is not required to by + // the protocol so we'll do it again just in case. + // Remove tag bits too. Ptrace calls may work regardless but that + // is not a guarantee. 
+ MemoryTagManager::TagRange range(details->manager->RemoveTagBits(addr), len); + range = details->manager->ExpandToGranule(range); + + // Not checking number of tags here, we may repeat them below + llvm::Expected> unpacked_tags_or_err = + details->manager->UnpackTagsData(tags); + if (!unpacked_tags_or_err) + return Status(unpacked_tags_or_err.takeError()); + + llvm::Expected> repeated_tags_or_err = + details->manager->RepeatTagsForRange(*unpacked_tags_or_err, range); + if (!repeated_tags_or_err) + return Status(repeated_tags_or_err.takeError()); + + // Repack them for ptrace to use + llvm::Expected> final_tag_data = + details->manager->PackTags(*repeated_tags_or_err); + if (!final_tag_data) + return Status(final_tag_data.takeError()); + + struct iovec tags_vec; + uint8_t *src = final_tag_data->data(); + lldb::addr_t write_addr = range.GetRangeBase(); + // unpacked tags size because the number of bytes per tag might not be 1 + size_t num_tags = repeated_tags_or_err->size(); + + // This call can partially write tags, so we loop until we + // error or all tags have been written. + while (num_tags > 0) { + tags_vec.iov_base = src; + tags_vec.iov_len = num_tags; + + Status error = NativeProcessAIX::PtraceWrapper( + details->ptrace_write_req, GetCurrentThreadID(), + reinterpret_cast(write_addr), static_cast(&tags_vec), 0, + nullptr); + + if (error.Fail()) { + // Don't attempt to restore the original values in the case of a partial + // write + return error; + } + + size_t tags_written = tags_vec.iov_len; + assert(tags_written && (tags_written <= num_tags)); + + src += tags_written * details->manager->GetTagSizeInBytes(); + write_addr += details->manager->GetGranuleSize() * tags_written; + num_tags -= tags_written; + } + + return Status(); +} + +size_t NativeProcessAIX::UpdateThreads() { + // The NativeProcessAIX monitoring threads are always up to date with + // respect to thread state and they keep the thread list populated properly. 
  // All this method needs to do is return the thread count.
  return m_threads.size();
}

// Set a breakpoint at `addr`, dispatching to the hardware or software
// breakpoint machinery depending on the caller's request.
Status NativeProcessAIX::SetBreakpoint(lldb::addr_t addr, uint32_t size,
                                       bool hardware) {
  if (hardware)
    return SetHardwareBreakpoint(addr, size);
  else
    return SetSoftwareBreakpoint(addr, size);
}

// Remove a breakpoint previously set at `addr`. Software breakpoints are
// delegated to the generic NativeProcessProtocol implementation.
Status NativeProcessAIX::RemoveBreakpoint(lldb::addr_t addr, bool hardware) {
  if (hardware)
    return RemoveHardwareBreakpoint(addr);
  else
    return NativeProcessProtocol::RemoveBreakpoint(addr);
}

// Return the trap opcode bytes used to implement software breakpoints.
// Only ARM gets a custom opcode here; every other architecture falls back to
// the base-class table. NOTE(review): this function (and its comment below)
// was inherited from the Linux port -- confirm relevance for AIX targets.
llvm::Expected<llvm::ArrayRef<uint8_t>>
NativeProcessAIX::GetSoftwareBreakpointTrapOpcode(size_t size_hint) {
  // The ARM reference recommends the use of 0xe7fddefe and 0xdefe but the
  // linux kernel does otherwise.
  static const uint8_t g_arm_opcode[] = {0xf0, 0x01, 0xf0, 0xe7};
  static const uint8_t g_thumb_opcode[] = {0x01, 0xde};

  switch (GetArchitecture().GetMachine()) {
  case llvm::Triple::arm:
    switch (size_hint) {
    case 2:
      return llvm::ArrayRef(g_thumb_opcode);
    case 4:
      return llvm::ArrayRef(g_arm_opcode);
    default:
      return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                     "Unrecognised trap opcode size hint!");
    }
  default:
    return NativeProcessProtocol::GetSoftwareBreakpointTrapOpcode(size_hint);
  }
}

// Read `size` bytes from the inferior at `addr` into `buf`, one ptrace word
// at a time via PT_READ_BLOCK. `bytes_read` reports how far we got, even on
// an early error return.
Status NativeProcessAIX::ReadMemory(lldb::addr_t addr, void *buf, size_t size,
                                    size_t &bytes_read) {
  unsigned char *dst = static_cast<unsigned char *>(buf);
  size_t remainder;
  long data;

  Log *log = GetLog(POSIXLog::Memory);
  LLDB_LOG(log, "addr = {0}, buf = {1}, size = {2}", addr, buf, size);

  for (bytes_read = 0; bytes_read < size; bytes_read += remainder) {
    Status error = NativeProcessAIX::PtraceWrapper(
        PT_READ_BLOCK, GetCurrentThreadID(), (void *)addr, nullptr,
        sizeof(data), &data);
    if (error.Fail())
      return error;

    // Clamp the final chunk so we never copy past the caller's buffer.
    remainder = size - bytes_read;
    remainder = remainder > k_ptrace_word_size ?
k_ptrace_word_size : remainder; + + // Copy the data into our buffer + memcpy(dst, &data, remainder); + + LLDB_LOG(log, "[{0:x}]:{1:x}", addr, data); + addr += k_ptrace_word_size; + dst += k_ptrace_word_size; + } + return Status(); +} + +Status NativeProcessAIX::WriteMemory(lldb::addr_t addr, const void *buf, + size_t size, size_t &bytes_written) { + const unsigned char *src = static_cast(buf); + size_t remainder; + Status error; + + Log *log = GetLog(POSIXLog::Memory); + LLDB_LOG(log, "addr = {0}, buf = {1}, size = {2}", addr, buf, size); + + error = NativeProcessAIX::PtraceWrapper( + PT_WRITE_BLOCK, GetCurrentThreadID(), (void *)addr, nullptr, (int)size, (long *)buf); + if (error.Fail()) + return error; + + bytes_written = size; + return error; +} + +int8_t NativeProcessAIX::GetSignalInfo(WaitStatus wstatus) const { + return wstatus.status; +} + +Status NativeProcessAIX::GetEventMessage(lldb::tid_t tid, + unsigned long *message) { + //FIXME + return PtraceWrapper(PT_CLEAR/*PTRACE_GETEVENTMSG*/, tid, nullptr, message); +} + +Status NativeProcessAIX::Detach(lldb::tid_t tid) { + if (tid == LLDB_INVALID_THREAD_ID) + return Status(); + + return PtraceWrapper(PT_DETACH, tid); +} + +bool NativeProcessAIX::HasThreadNoLock(lldb::tid_t thread_id) { + for (const auto &thread : m_threads) { + assert(thread && "thread list should not contain NULL threads"); + if (thread->GetID() == thread_id) { + // We have this thread. + return true; + } + } + + // We don't have this thread. 
+ return false; +} + +void NativeProcessAIX::StopTrackingThread(NativeThreadAIX &thread) { + Log *const log = GetLog(POSIXLog::Thread); + lldb::tid_t thread_id = thread.GetID(); + LLDB_LOG(log, "tid: {0}", thread_id); + + auto it = llvm::find_if(m_threads, [&](const auto &thread_up) { + return thread_up.get() == &thread; + }); + assert(it != m_threads.end()); + m_threads.erase(it); + + NotifyTracersOfThreadDestroyed(thread_id); + SignalIfAllThreadsStopped(); +} + +void NativeProcessAIX::NotifyTracersProcessDidStop() { +} + +void NativeProcessAIX::NotifyTracersProcessWillResume() { +} + +Status NativeProcessAIX::NotifyTracersOfNewThread(lldb::tid_t tid) { + Log *log = GetLog(POSIXLog::Thread); + Status error; + return error; +} + +Status NativeProcessAIX::NotifyTracersOfThreadDestroyed(lldb::tid_t tid) { + Log *log = GetLog(POSIXLog::Thread); + Status error; + return error; +} + +NativeThreadAIX &NativeProcessAIX::AddThread(lldb::tid_t thread_id, + bool resume) { + Log *log = GetLog(POSIXLog::Thread); + LLDB_LOG(log, "pid {0} adding thread with tid {1}", GetID(), thread_id); + + assert(!HasThreadNoLock(thread_id) && + "attempted to add a thread by id that already exists"); + + // If this is the first thread, save it as the current thread + if (m_threads.empty()) + SetCurrentThreadID(thread_id); + + m_threads.push_back(std::make_unique(*this, thread_id)); + NativeThreadAIX &thread = + static_cast(*m_threads.back()); + + Status tracing_error = NotifyTracersOfNewThread(thread.GetID()); + if (tracing_error.Fail()) { + thread.SetStoppedByProcessorTrace(tracing_error.AsCString()); + StopRunningThreads(thread.GetID()); + } else if (resume) + ResumeThread(thread, eStateRunning, LLDB_INVALID_SIGNAL_NUMBER); + else + thread.SetStoppedBySignal(SIGSTOP); + + return thread; +} + +Status NativeProcessAIX::GetLoadedModuleFileSpec(const char *module_path, + FileSpec &file_spec) { + Status error = PopulateMemoryRegionCache(); + if (error.Fail()) + return error; + + FileSpec 
      module_file_spec(module_path);
  FileSystem::Instance().Resolve(module_file_spec);

  file_spec.Clear();
  for (const auto &it : m_mem_region_cache) {
    if (it.second.GetFilename() == module_file_spec.GetFilename()) {
      file_spec = it.second;
      return Status();
    }
  }
  return Status("Module file (%s) not found in /proc/%" PRIu64 "/maps file!",
                module_file_spec.GetFilename().AsCString(), GetID());
}

// Report the load address of `file_name` in the inferior.
// NOTE(review): the current implementation ignores `file_name` and always
// returns the text origin of the first ld_xinfo entry (presumably the main
// executable) -- confirm behavior for shared libraries.
Status NativeProcessAIX::GetFileLoadAddress(const llvm::StringRef &file_name,
                                            lldb::addr_t &load_addr) {
  load_addr = LLDB_INVALID_ADDRESS;

  NativeThreadAIX &thread = *GetCurrentThread();
  NativeRegisterContextAIX &reg_ctx = thread.GetRegisterContext();

  // FIXME: buffer size
  struct ld_xinfo info[64];
  // PT_LDXINFO fills `info` with the loader section information for the
  // traced process; entry 0's text origin is reported on success.
  if (ptrace64(PT_LDXINFO, reg_ctx.GetThread().GetID(), (long long)&(info[0]),
               sizeof(info), nullptr) == 0) {
    load_addr = (unsigned long)info[0].ldinfo_textorg;
    return Status();
  }
  return Status("No load address found for specified file.");
}

// Typed convenience wrappers over the generic protocol lookups.
NativeThreadAIX *NativeProcessAIX::GetThreadByID(lldb::tid_t tid) {
  return static_cast<NativeThreadAIX *>(
      NativeProcessProtocol::GetThreadByID(tid));
}

NativeThreadAIX *NativeProcessAIX::GetCurrentThread() {
  return static_cast<NativeThreadAIX *>(
      NativeProcessProtocol::GetCurrentThread());
}

// Resume or single-step `thread`, updating the overall process state to
// eStateRunning when the operation succeeds.
Status NativeProcessAIX::ResumeThread(NativeThreadAIX &thread,
                                      lldb::StateType state, int signo) {
  Log *const log = GetLog(POSIXLog::Thread);
  LLDB_LOG(log, "tid: {0}", thread.GetID());

  // Before we do the resume below, first check if we have a pending stop
  // notification that is currently waiting for all threads to stop. This is
  // potentially a buggy situation since we're ostensibly waiting for threads
  // to stop before we send out the pending notification, and here we are
  // resuming one before we send out the pending stop notification.
  if (m_pending_notification_tid != LLDB_INVALID_THREAD_ID) {
    LLDB_LOG(log,
             "about to resume tid {0} per explicit request but we have a "
             "pending stop notification (tid {1}) that is actively "
             "waiting for this thread to stop. Valid sequence of events?",
             thread.GetID(), m_pending_notification_tid);
  }

  // Request a resume. We expect this to be synchronous and the system to
  // reflect it is running after this completes.
  switch (state) {
  case eStateRunning: {
    const auto resume_result = thread.Resume(signo);
    if (resume_result.Success())
      SetState(eStateRunning, true);
    return resume_result;
  }
  case eStateStepping: {
    const auto step_result = thread.SingleStep(signo);
    if (step_result.Success())
      SetState(eStateRunning, true);
    return step_result;
  }
  default:
    LLDB_LOG(log, "Unhandled state {0}.", state);
    llvm_unreachable("Unhandled state for resume");
  }
}

//===----------------------------------------------------------------------===//

// Ask every running thread to stop, then (once they have all reported in)
// deliver a single stop notification attributed to `triggering_tid`.
void NativeProcessAIX::StopRunningThreads(const lldb::tid_t triggering_tid) {
  Log *const log = GetLog(POSIXLog::Thread);
  LLDB_LOG(log, "about to process event: (triggering_tid: {0})",
           triggering_tid);

  m_pending_notification_tid = triggering_tid;

  // Request a stop for all the thread stops that need to be stopped and are
  // not already known to be stopped.
  for (const auto &thread : m_threads) {
    if (StateIsRunningState(thread->GetState()))
      static_cast<NativeThreadAIX *>(thread.get())->RequestStop();
  }

  SignalIfAllThreadsStopped();
  LLDB_LOG(log, "event processing done");
}

// If a stop notification is pending and no thread is still running, clean up
// any single-step breakpoints and broadcast the stopped state.
void NativeProcessAIX::SignalIfAllThreadsStopped() {
  if (m_pending_notification_tid == LLDB_INVALID_THREAD_ID)
    return; // No pending notification. Nothing to do.

  for (const auto &thread_sp : m_threads) {
    if (StateIsRunningState(thread_sp->GetState()))
      return; // Some threads are still running. Don't signal yet.
  }

  // We have a pending notification and all threads have stopped.
+ Log *log = GetLog(LLDBLog::Process | LLDBLog::Breakpoints); + + // Clear any temporary breakpoints we used to implement software single + // stepping. + for (const auto &thread_info : m_threads_stepping_with_breakpoint) { + Status error = RemoveBreakpoint(thread_info.second); + if (error.Fail()) + LLDB_LOG(log, "pid = {0} remove stepping breakpoint: {1}", + thread_info.first, error); + } + m_threads_stepping_with_breakpoint.clear(); + + // Notify the delegate about the stop + SetCurrentThreadID(m_pending_notification_tid); + SetState(StateType::eStateStopped, true); + m_pending_notification_tid = LLDB_INVALID_THREAD_ID; +} + +void NativeProcessAIX::ThreadWasCreated(NativeThreadAIX &thread) { + Log *const log = GetLog(POSIXLog::Thread); + LLDB_LOG(log, "tid: {0}", thread.GetID()); + + if (m_pending_notification_tid != LLDB_INVALID_THREAD_ID && + StateIsRunningState(thread.GetState())) { + // We will need to wait for this new thread to stop as well before firing + // the notification. + thread.RequestStop(); + } +} + +#define DECLARE_REGISTER_INFOS_PPC64LE_STRUCT +#include "Plugins/Process/Utility/RegisterInfos_ppc64le.h" +#undef DECLARE_REGISTER_INFOS_PPC64LE_STRUCT + +static void GetRegister(lldb::pid_t pid, long long addr, void *buf) { + uint64_t val = 0; + ptrace64(PT_READ_GPR, pid, addr, 0, (int *)&val); + *(uint64_t *)buf = llvm::byteswap(val); +} + +static void SetRegister(lldb::pid_t pid, long long addr, void *buf) { + uint64_t val = llvm::byteswap(*(uint64_t *)buf); + ptrace64(PT_WRITE_GPR, pid, addr, 0, (int *)&val); +} + +static void GetFPRegister(lldb::pid_t pid, long long addr, void *buf) { + uint64_t val = 0; + ptrace64(PT_READ_FPR, pid, addr, 0, (int *)&val); + *(uint64_t *)buf = llvm::byteswap(val); +} + +static void GetVMRegister(lldb::tid_t tid, long long addr, void *buf) { + uint64_t val = 0; + ptrace64(PTT_READ_VEC, tid, addr, 0, (int *)&val); + //*(uint64_t *)buf = llvm::byteswap(val); +} + +static void GetVSRegister(lldb::tid_t tid, long long 
addr, void *buf) { + uint64_t val = 0; + ptrace64(PTT_READ_VSX, tid, addr, 0, (int *)&val); + //*(uint64_t *)buf = llvm::byteswap(val); +} + +// Wrapper for ptrace to catch errors and log calls. Note that ptrace sets +// errno on error because -1 can be a valid result (i.e. for PTRACE_PEEK*) +Status NativeProcessAIX::PtraceWrapper(int req, lldb::pid_t pid, void *addr, + void *data, size_t data_size, + long *result) { + Status error; + long int ret; + + Log *log = GetLog(POSIXLog::Ptrace); + + PtraceDisplayBytes(req, data, data_size); + + errno = 0; + + // for PTT_* + const char procdir[] = "/proc/"; + const char lwpdir[] = "/lwp/"; + std::string process_task_dir = procdir + std::to_string(pid) + lwpdir; + DIR *dirproc = opendir(process_task_dir.c_str()); + + lldb::tid_t tid = 0; + if (dirproc) { + struct dirent *direntry = nullptr; + while ((direntry = readdir(dirproc)) != nullptr) { + if (strcmp(direntry->d_name, ".") == 0 || strcmp(direntry->d_name, "..") == 0) { + continue; + } + tid = atoi(direntry->d_name); + break; + } + closedir(dirproc); + } + + if (req == PTRACE_GETREGS) { + GetRegister(pid, GPR0, &(((GPR *)data)->r0)); + GetRegister(pid, GPR1, &(((GPR *)data)->r1)); + GetRegister(pid, GPR2, &(((GPR *)data)->r2)); + GetRegister(pid, GPR3, &(((GPR *)data)->r3)); + GetRegister(pid, GPR4, &(((GPR *)data)->r4)); + GetRegister(pid, GPR5, &(((GPR *)data)->r5)); + GetRegister(pid, GPR6, &(((GPR *)data)->r6)); + GetRegister(pid, GPR7, &(((GPR *)data)->r7)); + GetRegister(pid, GPR8, &(((GPR *)data)->r8)); + GetRegister(pid, GPR9, &(((GPR *)data)->r9)); + GetRegister(pid, GPR10, &(((GPR *)data)->r10)); + GetRegister(pid, GPR11, &(((GPR *)data)->r11)); + GetRegister(pid, GPR12, &(((GPR *)data)->r12)); + GetRegister(pid, GPR13, &(((GPR *)data)->r13)); + GetRegister(pid, GPR14, &(((GPR *)data)->r14)); + GetRegister(pid, GPR15, &(((GPR *)data)->r15)); + GetRegister(pid, GPR16, &(((GPR *)data)->r16)); + GetRegister(pid, GPR17, &(((GPR *)data)->r17)); + GetRegister(pid, 
GPR18, &(((GPR *)data)->r18)); + GetRegister(pid, GPR19, &(((GPR *)data)->r19)); + GetRegister(pid, GPR20, &(((GPR *)data)->r20)); + GetRegister(pid, GPR21, &(((GPR *)data)->r21)); + GetRegister(pid, GPR22, &(((GPR *)data)->r22)); + GetRegister(pid, GPR23, &(((GPR *)data)->r23)); + GetRegister(pid, GPR24, &(((GPR *)data)->r24)); + GetRegister(pid, GPR25, &(((GPR *)data)->r25)); + GetRegister(pid, GPR26, &(((GPR *)data)->r26)); + GetRegister(pid, GPR27, &(((GPR *)data)->r27)); + GetRegister(pid, GPR28, &(((GPR *)data)->r28)); + GetRegister(pid, GPR29, &(((GPR *)data)->r29)); + GetRegister(pid, GPR30, &(((GPR *)data)->r30)); + GetRegister(pid, GPR31, &(((GPR *)data)->r31)); + GetRegister(pid, IAR, &(((GPR *)data)->pc)); + GetRegister(pid, MSR, &(((GPR *)data)->msr)); + //FIXME: origr3/softe/trap on AIX? + GetRegister(pid, CTR, &(((GPR *)data)->ctr)); + GetRegister(pid, LR, &(((GPR *)data)->lr)); + GetRegister(pid, XER, &(((GPR *)data)->xer)); + GetRegister(pid, CR, &(((GPR *)data)->cr)); + } else if (req == PTRACE_SETREGS) { + SetRegister(pid, GPR0, &(((GPR *)data)->r0)); + SetRegister(pid, GPR1, &(((GPR *)data)->r1)); + SetRegister(pid, GPR2, &(((GPR *)data)->r2)); + SetRegister(pid, GPR3, &(((GPR *)data)->r3)); + SetRegister(pid, GPR4, &(((GPR *)data)->r4)); + SetRegister(pid, GPR5, &(((GPR *)data)->r5)); + SetRegister(pid, GPR6, &(((GPR *)data)->r6)); + SetRegister(pid, GPR7, &(((GPR *)data)->r7)); + SetRegister(pid, GPR8, &(((GPR *)data)->r8)); + SetRegister(pid, GPR9, &(((GPR *)data)->r9)); + SetRegister(pid, GPR10, &(((GPR *)data)->r10)); + SetRegister(pid, GPR11, &(((GPR *)data)->r11)); + SetRegister(pid, GPR12, &(((GPR *)data)->r12)); + SetRegister(pid, GPR13, &(((GPR *)data)->r13)); + SetRegister(pid, GPR14, &(((GPR *)data)->r14)); + SetRegister(pid, GPR15, &(((GPR *)data)->r15)); + SetRegister(pid, GPR16, &(((GPR *)data)->r16)); + SetRegister(pid, GPR17, &(((GPR *)data)->r17)); + SetRegister(pid, GPR18, &(((GPR *)data)->r18)); + SetRegister(pid, GPR19, 
&(((GPR *)data)->r19)); + SetRegister(pid, GPR20, &(((GPR *)data)->r20)); + SetRegister(pid, GPR21, &(((GPR *)data)->r21)); + SetRegister(pid, GPR22, &(((GPR *)data)->r22)); + SetRegister(pid, GPR23, &(((GPR *)data)->r23)); + SetRegister(pid, GPR24, &(((GPR *)data)->r24)); + SetRegister(pid, GPR25, &(((GPR *)data)->r25)); + SetRegister(pid, GPR26, &(((GPR *)data)->r26)); + SetRegister(pid, GPR27, &(((GPR *)data)->r27)); + SetRegister(pid, GPR28, &(((GPR *)data)->r28)); + SetRegister(pid, GPR29, &(((GPR *)data)->r29)); + SetRegister(pid, GPR30, &(((GPR *)data)->r30)); + SetRegister(pid, GPR31, &(((GPR *)data)->r31)); + SetRegister(pid, IAR, &(((GPR *)data)->pc)); + SetRegister(pid, MSR, &(((GPR *)data)->msr)); + //FIXME: origr3/softe/trap on AIX? + SetRegister(pid, CTR, &(((GPR *)data)->ctr)); + SetRegister(pid, LR, &(((GPR *)data)->lr)); + SetRegister(pid, XER, &(((GPR *)data)->xer)); + SetRegister(pid, CR, &(((GPR *)data)->cr)); + } else if (req == PTRACE_GETFPREGS) { + GetFPRegister(pid, FPR0, &(((FPR *)data)->f0)); + GetFPRegister(pid, FPR1, &(((FPR *)data)->f1)); + GetFPRegister(pid, FPR2, &(((FPR *)data)->f2)); + GetFPRegister(pid, FPR3, &(((FPR *)data)->f3)); + GetFPRegister(pid, FPR4, &(((FPR *)data)->f4)); + GetFPRegister(pid, FPR5, &(((FPR *)data)->f5)); + GetFPRegister(pid, FPR6, &(((FPR *)data)->f6)); + GetFPRegister(pid, FPR7, &(((FPR *)data)->f7)); + GetFPRegister(pid, FPR8, &(((FPR *)data)->f8)); + GetFPRegister(pid, FPR9, &(((FPR *)data)->f9)); + GetFPRegister(pid, FPR10, &(((FPR *)data)->f10)); + GetFPRegister(pid, FPR11, &(((FPR *)data)->f11)); + GetFPRegister(pid, FPR12, &(((FPR *)data)->f12)); + GetFPRegister(pid, FPR13, &(((FPR *)data)->f13)); + GetFPRegister(pid, FPR14, &(((FPR *)data)->f14)); + GetFPRegister(pid, FPR15, &(((FPR *)data)->f15)); + GetFPRegister(pid, FPR16, &(((FPR *)data)->f16)); + GetFPRegister(pid, FPR17, &(((FPR *)data)->f17)); + GetFPRegister(pid, FPR18, &(((FPR *)data)->f18)); + GetFPRegister(pid, FPR19, &(((FPR 
*)data)->f19)); + GetFPRegister(pid, FPR20, &(((FPR *)data)->f20)); + GetFPRegister(pid, FPR21, &(((FPR *)data)->f21)); + GetFPRegister(pid, FPR22, &(((FPR *)data)->f22)); + GetFPRegister(pid, FPR23, &(((FPR *)data)->f23)); + GetFPRegister(pid, FPR24, &(((FPR *)data)->f24)); + GetFPRegister(pid, FPR25, &(((FPR *)data)->f25)); + GetFPRegister(pid, FPR26, &(((FPR *)data)->f26)); + GetFPRegister(pid, FPR27, &(((FPR *)data)->f27)); + GetFPRegister(pid, FPR28, &(((FPR *)data)->f28)); + GetFPRegister(pid, FPR29, &(((FPR *)data)->f29)); + GetFPRegister(pid, FPR30, &(((FPR *)data)->f30)); + GetFPRegister(pid, FPR31, &(((FPR *)data)->f31)); + GetFPRegister(pid, FPSCR, &(((FPR *)data)->fpscr)); + } else if (req == PTRACE_GETVRREGS && tid) { + GetVMRegister(tid, VR0, &(((VMX *)data)->vr0[0])); + GetVMRegister(tid, VR1, &(((VMX *)data)->vr1[0])); + GetVMRegister(tid, VR2, &(((VMX *)data)->vr2[0])); + GetVMRegister(tid, VR3, &(((VMX *)data)->vr3[0])); + GetVMRegister(tid, VR4, &(((VMX *)data)->vr4[0])); + GetVMRegister(tid, VR5, &(((VMX *)data)->vr5[0])); + GetVMRegister(tid, VR6, &(((VMX *)data)->vr6[0])); + GetVMRegister(tid, VR7, &(((VMX *)data)->vr7[0])); + GetVMRegister(tid, VR8, &(((VMX *)data)->vr8[0])); + GetVMRegister(tid, VR9, &(((VMX *)data)->vr9[0])); + GetVMRegister(tid, VR10, &(((VMX *)data)->vr10[0])); + GetVMRegister(tid, VR11, &(((VMX *)data)->vr11[0])); + GetVMRegister(tid, VR12, &(((VMX *)data)->vr12[0])); + GetVMRegister(tid, VR13, &(((VMX *)data)->vr13[0])); + GetVMRegister(tid, VR14, &(((VMX *)data)->vr14[0])); + GetVMRegister(tid, VR15, &(((VMX *)data)->vr15[0])); + GetVMRegister(tid, VR16, &(((VMX *)data)->vr16[0])); + GetVMRegister(tid, VR17, &(((VMX *)data)->vr17[0])); + GetVMRegister(tid, VR18, &(((VMX *)data)->vr18[0])); + GetVMRegister(tid, VR19, &(((VMX *)data)->vr19[0])); + GetVMRegister(tid, VR20, &(((VMX *)data)->vr20[0])); + GetVMRegister(tid, VR21, &(((VMX *)data)->vr21[0])); + GetVMRegister(tid, VR22, &(((VMX *)data)->vr22[0])); + 
GetVMRegister(tid, VR23, &(((VMX *)data)->vr23[0])); + GetVMRegister(tid, VR24, &(((VMX *)data)->vr24[0])); + GetVMRegister(tid, VR25, &(((VMX *)data)->vr25[0])); + GetVMRegister(tid, VR26, &(((VMX *)data)->vr26[0])); + GetVMRegister(tid, VR27, &(((VMX *)data)->vr27[0])); + GetVMRegister(tid, VR28, &(((VMX *)data)->vr28[0])); + GetVMRegister(tid, VR29, &(((VMX *)data)->vr29[0])); + GetVMRegister(tid, VR30, &(((VMX *)data)->vr30[0])); + GetVMRegister(tid, VR31, &(((VMX *)data)->vr31[0])); + GetVMRegister(tid, VSCR, &(((VMX *)data)->vscr[0])); + GetVMRegister(tid, VRSAVE, &(((VMX *)data)->vrsave)); + } else if (req == PTRACE_GETVSRREGS && tid) { + GetVSRegister(tid, VSR0, &(((VSX *)data)->vs0[0])); + GetVSRegister(tid, VSR1, &(((VSX *)data)->vs1[0])); + GetVSRegister(tid, VSR2, &(((VSX *)data)->vs2[0])); + GetVSRegister(tid, VSR3, &(((VSX *)data)->vs3[0])); + GetVSRegister(tid, VSR4, &(((VSX *)data)->vs4[0])); + GetVSRegister(tid, VSR5, &(((VSX *)data)->vs5[0])); + GetVSRegister(tid, VSR6, &(((VSX *)data)->vs6[0])); + GetVSRegister(tid, VSR7, &(((VSX *)data)->vs7[0])); + GetVSRegister(tid, VSR8, &(((VSX *)data)->vs8[0])); + GetVSRegister(tid, VSR9, &(((VSX *)data)->vs9[0])); + GetVSRegister(tid, VSR10, &(((VSX *)data)->vs10[0])); + GetVSRegister(tid, VSR11, &(((VSX *)data)->vs11[0])); + GetVSRegister(tid, VSR12, &(((VSX *)data)->vs12[0])); + GetVSRegister(tid, VSR13, &(((VSX *)data)->vs13[0])); + GetVSRegister(tid, VSR14, &(((VSX *)data)->vs14[0])); + GetVSRegister(tid, VSR15, &(((VSX *)data)->vs15[0])); + GetVSRegister(tid, VSR16, &(((VSX *)data)->vs16[0])); + GetVSRegister(tid, VSR17, &(((VSX *)data)->vs17[0])); + GetVSRegister(tid, VSR18, &(((VSX *)data)->vs18[0])); + GetVSRegister(tid, VSR19, &(((VSX *)data)->vs19[0])); + GetVSRegister(tid, VSR20, &(((VSX *)data)->vs20[0])); + GetVSRegister(tid, VSR21, &(((VSX *)data)->vs21[0])); + GetVSRegister(tid, VSR22, &(((VSX *)data)->vs22[0])); + GetVSRegister(tid, VSR23, &(((VSX *)data)->vs23[0])); + GetVSRegister(tid, 
VSR24, &(((VSX *)data)->vs24[0])); + GetVSRegister(tid, VSR25, &(((VSX *)data)->vs25[0])); + GetVSRegister(tid, VSR26, &(((VSX *)data)->vs26[0])); + GetVSRegister(tid, VSR27, &(((VSX *)data)->vs27[0])); + GetVSRegister(tid, VSR28, &(((VSX *)data)->vs28[0])); + GetVSRegister(tid, VSR29, &(((VSX *)data)->vs29[0])); + GetVSRegister(tid, VSR30, &(((VSX *)data)->vs30[0])); + GetVSRegister(tid, VSR31, &(((VSX *)data)->vs31[0])); + GetVSRegister(tid, VSR32, &(((VSX *)data)->vs32[0])); + GetVSRegister(tid, VSR33, &(((VSX *)data)->vs33[0])); + GetVSRegister(tid, VSR34, &(((VSX *)data)->vs34[0])); + GetVSRegister(tid, VSR35, &(((VSX *)data)->vs35[0])); + GetVSRegister(tid, VSR36, &(((VSX *)data)->vs36[0])); + GetVSRegister(tid, VSR37, &(((VSX *)data)->vs37[0])); + GetVSRegister(tid, VSR38, &(((VSX *)data)->vs38[0])); + GetVSRegister(tid, VSR39, &(((VSX *)data)->vs39[0])); + GetVSRegister(tid, VSR40, &(((VSX *)data)->vs40[0])); + GetVSRegister(tid, VSR41, &(((VSX *)data)->vs41[0])); + GetVSRegister(tid, VSR42, &(((VSX *)data)->vs42[0])); + GetVSRegister(tid, VSR43, &(((VSX *)data)->vs43[0])); + GetVSRegister(tid, VSR44, &(((VSX *)data)->vs44[0])); + GetVSRegister(tid, VSR45, &(((VSX *)data)->vs45[0])); + GetVSRegister(tid, VSR46, &(((VSX *)data)->vs46[0])); + GetVSRegister(tid, VSR47, &(((VSX *)data)->vs47[0])); + GetVSRegister(tid, VSR48, &(((VSX *)data)->vs48[0])); + GetVSRegister(tid, VSR49, &(((VSX *)data)->vs49[0])); + GetVSRegister(tid, VSR50, &(((VSX *)data)->vs50[0])); + GetVSRegister(tid, VSR51, &(((VSX *)data)->vs51[0])); + GetVSRegister(tid, VSR52, &(((VSX *)data)->vs52[0])); + GetVSRegister(tid, VSR53, &(((VSX *)data)->vs53[0])); + GetVSRegister(tid, VSR54, &(((VSX *)data)->vs54[0])); + GetVSRegister(tid, VSR55, &(((VSX *)data)->vs55[0])); + GetVSRegister(tid, VSR56, &(((VSX *)data)->vs56[0])); + GetVSRegister(tid, VSR57, &(((VSX *)data)->vs57[0])); + GetVSRegister(tid, VSR58, &(((VSX *)data)->vs58[0])); + GetVSRegister(tid, VSR59, &(((VSX *)data)->vs59[0])); + 
GetVSRegister(tid, VSR60, &(((VSX *)data)->vs60[0])); + GetVSRegister(tid, VSR61, &(((VSX *)data)->vs61[0])); + GetVSRegister(tid, VSR62, &(((VSX *)data)->vs62[0])); + GetVSRegister(tid, VSR63, &(((VSX *)data)->vs63[0])); + } else if (req < PT_COMMAND_MAX) { + if (req == PT_CONTINUE) { +#if 0 + // Use PTT_CONTINUE + const char procdir[] = "/proc/"; + const char lwpdir[] = "/lwp/"; + std::string process_task_dir = procdir + std::to_string(pid) + lwpdir; + DIR *dirproc = opendir(process_task_dir.c_str()); + + struct ptthreads64 pts; + int idx = 0; + lldb::tid_t tid = 0; + if (dirproc) { + struct dirent *direntry = nullptr; + while ((direntry = readdir(dirproc)) != nullptr) { + if (strcmp(direntry->d_name, ".") == 0 || strcmp(direntry->d_name, "..") == 0) { + continue; + } + tid = atoi(direntry->d_name); + pts.th[idx++] = tid; + } + closedir(dirproc); + } + pts.th[idx] = 0; + ret = ptrace64(PTT_CONTINUE, tid, (long long)1, (int)(size_t)data, (int *)&pts); +#else + int buf; + ptrace64(req, pid, 1, (int)(size_t)data, &buf); +#endif + } else if (req == PT_READ_BLOCK) { + ptrace64(req, pid, (long long)addr, (int)data_size, (int *)result); + } else if (req == PT_WRITE_BLOCK) { + ptrace64(req, pid, (long long)addr, (int)data_size, (int *)result); + } else if (req == PT_ATTACH) { + ptrace64(req, pid, 0, 0, nullptr); + } else if (req == PT_WATCH) { + ptrace64(req, pid, (long long)addr, (int)data_size, nullptr); + } else if (req == PT_DETACH) { + ptrace64(req, pid, 0, 0, nullptr); + } else { + assert(0 && "Not supported yet."); + } + } else { + assert(0 && "Not supported yet."); + } + + if (errno) { + error.SetErrorToErrno(); + ret = -1; + } + + LLDB_LOG(log, "ptrace({0}, {1}, {2}, {3}, {4})={5:x}", req, pid, addr, data, + data_size, ret); + + PtraceDisplayBytes(req, data, data_size); + + if (error.Fail()) + LLDB_LOG(log, "ptrace() failed: {0}", error); + + return error; +} + +llvm::Expected NativeProcessAIX::TraceSupported() { + return NativeProcessProtocol::TraceSupported(); 
+} + +Error NativeProcessAIX::TraceStart(StringRef json_request, StringRef type) { + return NativeProcessProtocol::TraceStart(json_request, type); +} + +Error NativeProcessAIX::TraceStop(const TraceStopRequest &request) { + return NativeProcessProtocol::TraceStop(request); +} + +Expected NativeProcessAIX::TraceGetState(StringRef type) { + return NativeProcessProtocol::TraceGetState(type); +} + +Expected> NativeProcessAIX::TraceGetBinaryData( + const TraceGetBinaryDataRequest &request) { + return NativeProcessProtocol::TraceGetBinaryData(request); +} diff --git a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.h b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.h new file mode 100644 index 0000000000000..bdb6f7c500885 --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.h @@ -0,0 +1,283 @@ +//===-- NativeProcessAIX.h ---------------------------------- -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef liblldb_NativeProcessAIX_H_ +#define liblldb_NativeProcessAIX_H_ + +#include +#include + +#include "lldb/Host/Debug.h" +#include "lldb/Host/HostThread.h" +#include "lldb/Target/MemoryRegionInfo.h" +#include "lldb/Utility/ArchSpec.h" +#include "lldb/Utility/FileSpec.h" +#include "lldb/lldb-types.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "lldb/Host/aix/Support.h" + +#include "NativeThreadAIX.h" +#include "lldb/Host/common/NativeProcessProtocol.h" +#include "Plugins/Process/Utility/NativeProcessSoftwareSingleStep.h" + +namespace lldb_private { +class Status; +class Scalar; + +namespace process_aix { +/// \class NativeProcessAIX +/// Manages communication with the inferior (debugee) process. 
+/// +/// Upon construction, this class prepares and launches an inferior process +/// for debugging. +/// +/// Changes in the inferior process state are broadcasted. +class NativeProcessAIX : public NativeProcessProtocol, + private NativeProcessSoftwareSingleStep { +public: + class Manager : public NativeProcessProtocol::Manager { + public: + Manager(MainLoop &mainloop); + + llvm::Expected> + Launch(ProcessLaunchInfo &launch_info, + NativeDelegate &native_delegate) override; + + llvm::Expected> + Attach(lldb::pid_t pid, NativeDelegate &native_delegate) override; + + Extension GetSupportedExtensions() const override; + + void AddProcess(NativeProcessAIX &process) { + m_processes.insert(&process); + } + + void RemoveProcess(NativeProcessAIX &process) { + m_processes.erase(&process); + } + + // Collect an event for the given tid, waiting for it if necessary. + void CollectThread(::pid_t tid); + + private: + MainLoop::SignalHandleUP m_sigchld_handle; + + llvm::SmallPtrSet m_processes; + + // Threads (events) which haven't been claimed by any process. 
+ llvm::DenseSet<::pid_t> m_unowned_threads; + + void SigchldHandler(); + }; + + // NativeProcessProtocol Interface + + ~NativeProcessAIX() override { m_manager.RemoveProcess(*this); } + + Status Resume(const ResumeActionList &resume_actions) override; + + Status Halt() override; + + Status Detach() override; + + Status Signal(int signo) override; + + Status Interrupt() override; + + Status Kill() override; + + lldb::addr_t GetSharedLibraryInfoAddress() override; + + Status GetMemoryRegionInfo(lldb::addr_t load_addr, + MemoryRegionInfo &range_info) override; + + Status ReadMemory(lldb::addr_t addr, void *buf, size_t size, + size_t &bytes_read) override; + + Status WriteMemory(lldb::addr_t addr, const void *buf, size_t size, + size_t &bytes_written) override; + + llvm::Expected AllocateMemory(size_t size, + uint32_t permissions) override; + + llvm::Error DeallocateMemory(lldb::addr_t addr) override; + + Status ReadMemoryTags(int32_t type, lldb::addr_t addr, size_t len, + std::vector &tags) override; + + Status WriteMemoryTags(int32_t type, lldb::addr_t addr, size_t len, + const std::vector &tags) override; + + size_t UpdateThreads() override; + + const ArchSpec &GetArchitecture() const override { return m_arch; } + + Status SetBreakpoint(lldb::addr_t addr, uint32_t size, + bool hardware) override; + + Status RemoveBreakpoint(lldb::addr_t addr, bool hardware = false) override; + + void DoStopIDBumped(uint32_t newBumpId) override; + + Status GetLoadedModuleFileSpec(const char *module_path, + FileSpec &file_spec) override; + + Status GetFileLoadAddress(const llvm::StringRef &file_name, + lldb::addr_t &load_addr) override; + + NativeThreadAIX *GetThreadByID(lldb::tid_t id); + NativeThreadAIX *GetCurrentThread(); + + llvm::ErrorOr> + GetAuxvData() const override { + // Not available on this target. 
+ return llvm::errc::not_supported; + } + + /// Tracing + /// These methods implement the jLLDBTrace packets + /// \{ + llvm::Error TraceStart(llvm::StringRef json_request, + llvm::StringRef type) override; + + llvm::Error TraceStop(const TraceStopRequest &request) override; + + llvm::Expected + TraceGetState(llvm::StringRef type) override; + + llvm::Expected> + TraceGetBinaryData(const TraceGetBinaryDataRequest &request) override; + + llvm::Expected TraceSupported() override; + /// } + + // Interface used by NativeRegisterContext-derived classes. + static Status PtraceWrapper(int req, lldb::pid_t pid, void *addr = nullptr, + void *data = nullptr, size_t data_size = 0, + long *result = nullptr); + + bool SupportHardwareSingleStepping() const; + + /// Writes a siginfo_t structure corresponding to the given thread ID to the + /// memory region pointed to by \p siginfo. + int8_t GetSignalInfo(WaitStatus wstatus) const; + +protected: + llvm::Expected> + GetSoftwareBreakpointTrapOpcode(size_t size_hint) override; + + llvm::Expected Syscall(llvm::ArrayRef args); + +private: + Manager &m_manager; + /*MainLoop::SignalHandleUP m_sigchld_handle;*/ + ArchSpec m_arch; + /*MainLoop& m_main_loop;*/ + + LazyBool m_supports_mem_region = eLazyBoolCalculate; + std::vector> m_mem_region_cache; + + lldb::tid_t m_pending_notification_tid = LLDB_INVALID_THREAD_ID; + + /// Inferior memory (allocated by us) and its size. + llvm::DenseMap m_allocated_memory; + + // Private Instance Methods + NativeProcessAIX(::pid_t pid, int terminal_fd, NativeDelegate &delegate, + const ArchSpec &arch, Manager &manager, + llvm::ArrayRef<::pid_t> tids); + + // Returns a list of process threads that we have attached to. 
+ static llvm::Expected> Attach(::pid_t pid); + + static Status SetDefaultPtraceOpts(const lldb::pid_t); + + bool TryHandleWaitStatus(lldb::pid_t pid, WaitStatus status); + + void MonitorCallback(NativeThreadAIX &thread, WaitStatus status); + + void MonitorSIGTRAP(const WaitStatus status, NativeThreadAIX &thread); + + void MonitorTrace(NativeThreadAIX &thread); + + void MonitorBreakpoint(NativeThreadAIX &thread); + + void MonitorWatchpoint(NativeThreadAIX &thread, uint32_t wp_index); + + void MonitorSignal(const WaitStatus status, NativeThreadAIX &thread); + + bool HasThreadNoLock(lldb::tid_t thread_id); + + void StopTrackingThread(NativeThreadAIX &thread); + + /// Create a new thread. + /// + /// If process tracing is enabled and the thread can't be traced, then the + /// thread is left stopped with a \a eStopReasonProcessorTrace status, and + /// then the process is stopped. + /// + /// \param[in] resume + /// If a tracing error didn't happen, then resume the thread after + /// creation if \b true, or leave it stopped with SIGSTOP if \b false. + NativeThreadAIX &AddThread(lldb::tid_t thread_id, bool resume); + + /// Start tracing a new thread if process tracing is enabled. + /// + /// Trace mechanisms should modify this method to provide automatic tracing + /// for new threads. + Status NotifyTracersOfNewThread(lldb::tid_t tid); + + /// Stop tracing threads upon a destroy event. + /// + /// Trace mechanisms should modify this method to provide automatic trace + /// stopping for threads being destroyed. + Status NotifyTracersOfThreadDestroyed(lldb::tid_t tid); + + void NotifyTracersProcessWillResume() override; + + void NotifyTracersProcessDidStop() override; + /// Writes the raw event message code (vis-a-vis PTRACE_GETEVENTMSG) + /// corresponding to the given thread ID to the memory pointed to by @p + /// message. 
+ Status GetEventMessage(lldb::tid_t tid, unsigned long *message); + + void NotifyThreadDeath(lldb::tid_t tid); + + Status Detach(lldb::tid_t tid); + + // This method is requests a stop on all threads which are still running. It + // sets up a + // deferred delegate notification, which will fire once threads report as + // stopped. The + // triggerring_tid will be set as the current thread (main stop reason). + void StopRunningThreads(lldb::tid_t triggering_tid); + + // Notify the delegate if all threads have stopped. + void SignalIfAllThreadsStopped(); + + // Resume the given thread, optionally passing it the given signal. The type + // of resume + // operation (continue, single-step) depends on the state parameter. + Status ResumeThread(NativeThreadAIX &thread, lldb::StateType state, + int signo); + + void ThreadWasCreated(NativeThreadAIX &thread); + + void SigchldHandler(); + + Status PopulateMemoryRegionCache(); + + // Handle a clone()-like event. + bool MonitorClone(NativeThreadAIX &parent, lldb::pid_t child_pid, + int event); +}; + +} // namespace process_aix +} // namespace lldb_private + +#endif // #ifndef liblldb_NativeProcessAIX_H_ diff --git a/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp new file mode 100644 index 0000000000000..0859f9501c1b6 --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp @@ -0,0 +1,157 @@ +//===-- NativeRegisterContextAIX.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NativeRegisterContextAIX.h" + +#include "lldb/Host/common/NativeProcessProtocol.h" +#include "lldb/Host/common/NativeThreadProtocol.h" +#include "lldb/Utility/RegisterValue.h" + +#include "Plugins/Process/AIX/NativeProcessAIX.h" +#include "Plugins/Process/POSIX/ProcessPOSIXLog.h" +#include "lldb/Host/aix/Ptrace.h" + +using namespace lldb_private; +using namespace lldb_private::process_aix; + +lldb::ByteOrder NativeRegisterContextAIX::GetByteOrder() const { + return m_thread.GetProcess().GetByteOrder(); +} + +Status NativeRegisterContextAIX::ReadRegisterRaw(uint32_t reg_index, + RegisterValue ®_value) { + const RegisterInfo *const reg_info = GetRegisterInfoAtIndex(reg_index); + if (!reg_info) + return Status("register %" PRIu32 " not found", reg_index); + + return DoReadRegisterValue(GetPtraceOffset(reg_index), reg_info->name, + reg_info->byte_size, reg_value); +} + +Status +NativeRegisterContextAIX::WriteRegisterRaw(uint32_t reg_index, + const RegisterValue ®_value) { + uint32_t reg_to_write = reg_index; + RegisterValue value_to_write = reg_value; + + // Check if this is a subregister of a full register. + const RegisterInfo *reg_info = GetRegisterInfoAtIndex(reg_index); + if (reg_info->invalidate_regs && + (reg_info->invalidate_regs[0] != LLDB_INVALID_REGNUM)) { + Status error; + + RegisterValue full_value; + uint32_t full_reg = reg_info->invalidate_regs[0]; + const RegisterInfo *full_reg_info = GetRegisterInfoAtIndex(full_reg); + + // Read the full register. + error = ReadRegister(full_reg_info, full_value); + if (error.Fail()) + return error; + + lldb::ByteOrder byte_order = GetByteOrder(); + uint8_t dst[RegisterValue::kMaxRegisterByteSize]; + + // Get the bytes for the full register. 
+ const uint32_t dest_size = full_value.GetAsMemoryData( + *full_reg_info, dst, sizeof(dst), byte_order, error); + if (error.Success() && dest_size) { + uint8_t src[RegisterValue::kMaxRegisterByteSize]; + + // Get the bytes for the source data. + const uint32_t src_size = reg_value.GetAsMemoryData( + *reg_info, src, sizeof(src), byte_order, error); + if (error.Success() && src_size && (src_size < dest_size)) { + // Copy the src bytes to the destination. + memcpy(dst + (reg_info->byte_offset & 0x1), src, src_size); + // Set this full register as the value to write. + value_to_write.SetBytes(dst, full_value.GetByteSize(), byte_order); + value_to_write.SetType(*full_reg_info); + reg_to_write = full_reg; + } + } + } + + const RegisterInfo *const register_to_write_info_p = + GetRegisterInfoAtIndex(reg_to_write); + assert(register_to_write_info_p && + "register to write does not have valid RegisterInfo"); + if (!register_to_write_info_p) + return Status("NativeRegisterContextAIX::%s failed to get RegisterInfo " + "for write register index %" PRIu32, + __FUNCTION__, reg_to_write); + + return DoWriteRegisterValue(GetPtraceOffset(reg_index), reg_info->name, + reg_value); +} + +Status NativeRegisterContextAIX::ReadGPR() { + return NativeProcessAIX::PtraceWrapper( + PTRACE_GETREGS, m_thread.GetID(), nullptr, GetGPRBuffer(), GetGPRSize()); +} + +Status NativeRegisterContextAIX::WriteGPR() { + return NativeProcessAIX::PtraceWrapper( + PTRACE_SETREGS, m_thread.GetID(), nullptr, GetGPRBuffer(), GetGPRSize()); +} + +Status NativeRegisterContextAIX::ReadFPR() { + return NativeProcessAIX::PtraceWrapper(PTRACE_GETFPREGS, m_thread.GetID(), + nullptr, GetFPRBuffer(), + GetFPRSize()); +} + +Status NativeRegisterContextAIX::WriteFPR() { + return NativeProcessAIX::PtraceWrapper(PTRACE_SETFPREGS, m_thread.GetID(), + nullptr, GetFPRBuffer(), + GetFPRSize()); +} + +Status NativeRegisterContextAIX::ReadRegisterSet(void *buf, size_t buf_size, + unsigned int regset) { + return 
NativeProcessAIX::PtraceWrapper(PTRACE_GETREGSET, m_thread.GetID(), + static_cast(®set), buf, + buf_size); +} + +Status NativeRegisterContextAIX::WriteRegisterSet(void *buf, size_t buf_size, + unsigned int regset) { + return NativeProcessAIX::PtraceWrapper(PTRACE_SETREGSET, m_thread.GetID(), + static_cast(®set), buf, + buf_size); +} + +Status NativeRegisterContextAIX::DoReadRegisterValue(uint32_t offset, + const char *reg_name, + uint32_t size, + RegisterValue &value) { + Log *log = GetLog(POSIXLog::Registers); + + long data; + Status error = NativeProcessAIX::PtraceWrapper( + PTRACE_PEEKUSER, m_thread.GetID(), reinterpret_cast(offset), + nullptr, 0, &data); + + if (error.Success()) + // First cast to an unsigned of the same size to avoid sign extension. + value.SetUInt(static_cast(data), size); + + LLDB_LOG(log, "{0}: {1:x}", reg_name, data); + return error; +} + +Status NativeRegisterContextAIX::DoWriteRegisterValue( + uint32_t offset, const char *reg_name, const RegisterValue &value) { + Log *log = GetLog(POSIXLog::Registers); + + void *buf = reinterpret_cast(value.GetAsUInt64()); + LLDB_LOG(log, "{0}: {1}", reg_name, buf); + + return NativeProcessAIX::PtraceWrapper( + PTRACE_POKEUSER, m_thread.GetID(), reinterpret_cast(offset), buf); +} diff --git a/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.h b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.h new file mode 100644 index 0000000000000..9c2a326856c0b --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.h @@ -0,0 +1,133 @@ +//===-- NativeRegisterContextAIX.h ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef lldb_NativeRegisterContextAIX_h +#define lldb_NativeRegisterContextAIX_h + +#include "Plugins/Process/Utility/NativeRegisterContextRegisterInfo.h" +#include "lldb/Host/common/NativeThreadProtocol.h" +#include "lldb/Target/MemoryTagManager.h" +#include "llvm/Support/Error.h" + +namespace lldb_private { +namespace process_aix { + +class NativeThreadAIX; + +class NativeRegisterContextAIX + : public virtual NativeRegisterContextRegisterInfo { +public: + // This function is implemented in the NativeRegisterContextAIX_* subclasses + // to create a new instance of the host specific NativeRegisterContextAIX. + // The implementations can't collide as only one NativeRegisterContextAIX_* + // variant should be compiled into the final executable. + static std::unique_ptr + CreateHostNativeRegisterContextAIX(const ArchSpec &target_arch, + NativeThreadAIX &native_thread); + + // Invalidates cached values in register context data structures + virtual void InvalidateAllRegisters(){} + + struct SyscallData { + /// The syscall instruction. If the architecture uses software + /// single-stepping, the instruction should also be followed by a trap to + /// ensure the process is stopped after the syscall. + llvm::ArrayRef Insn; + + /// Registers used for syscall arguments. The first register is used to + /// store the syscall number. + llvm::ArrayRef Args; + + uint32_t Result; ///< Register containing the syscall result. + }; + /// Return architecture-specific data needed to make inferior syscalls, if + /// they are supported. + virtual std::optional GetSyscallData() { return std::nullopt; } + + struct MmapData { + // Syscall numbers can be found (e.g.) in /usr/include/asm/unistd.h for the + // relevant architecture. + unsigned SysMmap; ///< mmap syscall number. 
+ unsigned SysMunmap; ///< munmap syscall number + }; + /// Return the architecture-specific data needed to make mmap syscalls, if + /// they are supported. + virtual std::optional GetMmapData() { return std::nullopt; } + + struct MemoryTaggingDetails { + /// Object with tag handling utilities. If the function below returns + /// a valid structure, you can assume that this pointer is valid. + std::unique_ptr manager; + int ptrace_read_req; /// ptrace operation number for memory tag read + int ptrace_write_req; /// ptrace operation number for memory tag write + }; + /// Return architecture specific data needed to use memory tags, + /// if they are supported. + virtual llvm::Expected + GetMemoryTaggingDetails(int32_t type) { + return llvm::createStringError( + llvm::inconvertibleErrorCode(), + "Architecture does not support memory tagging"); + } + +protected: + // NB: This constructor is here only because gcc<=6.5 requires a virtual base + // class initializer on abstract class (even though it is never used). It can + // be deleted once we move to gcc>=7.0. 
+ NativeRegisterContextAIX(NativeThreadProtocol &thread) + : NativeRegisterContextRegisterInfo(thread, nullptr) {} + + lldb::ByteOrder GetByteOrder() const; + + virtual Status ReadRegisterRaw(uint32_t reg_index, RegisterValue ®_value); + + virtual Status WriteRegisterRaw(uint32_t reg_index, + const RegisterValue ®_value); + + virtual Status ReadRegisterSet(void *buf, size_t buf_size, + unsigned int regset); + + virtual Status WriteRegisterSet(void *buf, size_t buf_size, + unsigned int regset); + + virtual Status ReadGPR(); + + virtual Status WriteGPR(); + + virtual Status ReadFPR(); + + virtual Status WriteFPR(); + + virtual void *GetGPRBuffer() = 0; + + virtual size_t GetGPRSize() const { + return GetRegisterInfoInterface().GetGPRSize(); + } + + virtual void *GetFPRBuffer() = 0; + + virtual size_t GetFPRSize() = 0; + + virtual uint32_t GetPtraceOffset(uint32_t reg_index) { + return GetRegisterInfoAtIndex(reg_index)->byte_offset; + } + + // The Do*** functions are executed on the privileged thread and can perform + // ptrace + // operations directly. + virtual Status DoReadRegisterValue(uint32_t offset, const char *reg_name, + uint32_t size, RegisterValue &value); + + virtual Status DoWriteRegisterValue(uint32_t offset, const char *reg_name, + const RegisterValue &value); +}; + +} // namespace process_aix +} // namespace lldb_private + +#endif // #ifndef lldb_NativeRegisterContextAIX_h diff --git a/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.cpp b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.cpp new file mode 100644 index 0000000000000..1996373791748 --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.cpp @@ -0,0 +1,744 @@ +//===-- NativeRegisterContextAIX_ppc64.cpp ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This implementation is related to the OpenPOWER ABI for Power Architecture +// 64-bit ELF V2 ABI + +#if defined(__powerpc64__) + +#include "NativeRegisterContextAIX_ppc64.h" + +#include "lldb/Host/common/NativeProcessProtocol.h" +#include "lldb/Utility/DataBufferHeap.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/RegisterValue.h" +#include "lldb/Utility/Status.h" +#include "lldb/Host/aix/Ptrace.h" + +#include "Plugins/Process/AIX/NativeProcessAIX.h" +#include "Plugins/Process/Linux/Procfs.h" +#include "Plugins/Process/POSIX/ProcessPOSIXLog.h" +#include "Plugins/Process/Utility/RegisterInfoPOSIX_ppc64le.h" + +// System includes - They have to be included after framework includes because +// they define some macros which collide with variable names in other modules +#include +#include +#include +#include + +#define REG_CONTEXT_SIZE \ + (GetGPRSize() + GetFPRSize() + sizeof(m_vmx_ppc64le) + sizeof(m_vsx_ppc64le)) +using namespace lldb; +using namespace lldb_private; +using namespace lldb_private::process_aix; + +static const uint32_t g_gpr_regnums_ppc64le[] = { + gpr_r0_ppc64le, gpr_r1_ppc64le, gpr_r2_ppc64le, gpr_r3_ppc64le, + gpr_r4_ppc64le, gpr_r5_ppc64le, gpr_r6_ppc64le, gpr_r7_ppc64le, + gpr_r8_ppc64le, gpr_r9_ppc64le, gpr_r10_ppc64le, gpr_r11_ppc64le, + gpr_r12_ppc64le, gpr_r13_ppc64le, gpr_r14_ppc64le, gpr_r15_ppc64le, + gpr_r16_ppc64le, gpr_r17_ppc64le, gpr_r18_ppc64le, gpr_r19_ppc64le, + gpr_r20_ppc64le, gpr_r21_ppc64le, gpr_r22_ppc64le, gpr_r23_ppc64le, + gpr_r24_ppc64le, gpr_r25_ppc64le, gpr_r26_ppc64le, gpr_r27_ppc64le, + gpr_r28_ppc64le, gpr_r29_ppc64le, gpr_r30_ppc64le, gpr_r31_ppc64le, + gpr_pc_ppc64le, gpr_msr_ppc64le, gpr_origr3_ppc64le, gpr_ctr_ppc64le, + gpr_lr_ppc64le, gpr_xer_ppc64le, gpr_cr_ppc64le, gpr_softe_ppc64le, + gpr_trap_ppc64le, + LLDB_INVALID_REGNUM // register sets need to 
end with this flag +}; + +static const uint32_t g_fpr_regnums_ppc64le[] = { + fpr_f0_ppc64le, fpr_f1_ppc64le, fpr_f2_ppc64le, fpr_f3_ppc64le, + fpr_f4_ppc64le, fpr_f5_ppc64le, fpr_f6_ppc64le, fpr_f7_ppc64le, + fpr_f8_ppc64le, fpr_f9_ppc64le, fpr_f10_ppc64le, fpr_f11_ppc64le, + fpr_f12_ppc64le, fpr_f13_ppc64le, fpr_f14_ppc64le, fpr_f15_ppc64le, + fpr_f16_ppc64le, fpr_f17_ppc64le, fpr_f18_ppc64le, fpr_f19_ppc64le, + fpr_f20_ppc64le, fpr_f21_ppc64le, fpr_f22_ppc64le, fpr_f23_ppc64le, + fpr_f24_ppc64le, fpr_f25_ppc64le, fpr_f26_ppc64le, fpr_f27_ppc64le, + fpr_f28_ppc64le, fpr_f29_ppc64le, fpr_f30_ppc64le, fpr_f31_ppc64le, + fpr_fpscr_ppc64le, + LLDB_INVALID_REGNUM // register sets need to end with this flag +}; + +static const uint32_t g_vmx_regnums_ppc64le[] = { + vmx_vr0_ppc64le, vmx_vr1_ppc64le, vmx_vr2_ppc64le, vmx_vr3_ppc64le, + vmx_vr4_ppc64le, vmx_vr5_ppc64le, vmx_vr6_ppc64le, vmx_vr7_ppc64le, + vmx_vr8_ppc64le, vmx_vr9_ppc64le, vmx_vr10_ppc64le, vmx_vr11_ppc64le, + vmx_vr12_ppc64le, vmx_vr13_ppc64le, vmx_vr14_ppc64le, vmx_vr15_ppc64le, + vmx_vr16_ppc64le, vmx_vr17_ppc64le, vmx_vr18_ppc64le, vmx_vr19_ppc64le, + vmx_vr20_ppc64le, vmx_vr21_ppc64le, vmx_vr22_ppc64le, vmx_vr23_ppc64le, + vmx_vr24_ppc64le, vmx_vr25_ppc64le, vmx_vr26_ppc64le, vmx_vr27_ppc64le, + vmx_vr28_ppc64le, vmx_vr29_ppc64le, vmx_vr30_ppc64le, vmx_vr31_ppc64le, + vmx_vscr_ppc64le, vmx_vrsave_ppc64le, + LLDB_INVALID_REGNUM // register sets need to end with this flag +}; + +static const uint32_t g_vsx_regnums_ppc64le[] = { + vsx_vs0_ppc64le, vsx_vs1_ppc64le, vsx_vs2_ppc64le, vsx_vs3_ppc64le, + vsx_vs4_ppc64le, vsx_vs5_ppc64le, vsx_vs6_ppc64le, vsx_vs7_ppc64le, + vsx_vs8_ppc64le, vsx_vs9_ppc64le, vsx_vs10_ppc64le, vsx_vs11_ppc64le, + vsx_vs12_ppc64le, vsx_vs13_ppc64le, vsx_vs14_ppc64le, vsx_vs15_ppc64le, + vsx_vs16_ppc64le, vsx_vs17_ppc64le, vsx_vs18_ppc64le, vsx_vs19_ppc64le, + vsx_vs20_ppc64le, vsx_vs21_ppc64le, vsx_vs22_ppc64le, vsx_vs23_ppc64le, + vsx_vs24_ppc64le, vsx_vs25_ppc64le, 
vsx_vs26_ppc64le, vsx_vs27_ppc64le, + vsx_vs28_ppc64le, vsx_vs29_ppc64le, vsx_vs30_ppc64le, vsx_vs31_ppc64le, + vsx_vs32_ppc64le, vsx_vs33_ppc64le, vsx_vs34_ppc64le, vsx_vs35_ppc64le, + vsx_vs36_ppc64le, vsx_vs37_ppc64le, vsx_vs38_ppc64le, vsx_vs39_ppc64le, + vsx_vs40_ppc64le, vsx_vs41_ppc64le, vsx_vs42_ppc64le, vsx_vs43_ppc64le, + vsx_vs44_ppc64le, vsx_vs45_ppc64le, vsx_vs46_ppc64le, vsx_vs47_ppc64le, + vsx_vs48_ppc64le, vsx_vs49_ppc64le, vsx_vs50_ppc64le, vsx_vs51_ppc64le, + vsx_vs52_ppc64le, vsx_vs53_ppc64le, vsx_vs54_ppc64le, vsx_vs55_ppc64le, + vsx_vs56_ppc64le, vsx_vs57_ppc64le, vsx_vs58_ppc64le, vsx_vs59_ppc64le, + vsx_vs60_ppc64le, vsx_vs61_ppc64le, vsx_vs62_ppc64le, vsx_vs63_ppc64le, + LLDB_INVALID_REGNUM // register sets need to end with this flag +}; + +// Number of register sets provided by this context. +static constexpr int k_num_register_sets = 4; + +static const RegisterSet g_reg_sets_ppc64le[k_num_register_sets] = { + {"General Purpose Registers", "gpr", k_num_gpr_registers_ppc64le, + g_gpr_regnums_ppc64le}, + {"Floating Point Registers", "fpr", k_num_fpr_registers_ppc64le, + g_fpr_regnums_ppc64le}, + {"AltiVec/VMX Registers", "vmx", k_num_vmx_registers_ppc64le, + g_vmx_regnums_ppc64le}, + {"VSX Registers", "vsx", k_num_vsx_registers_ppc64le, + g_vsx_regnums_ppc64le}, +}; + +std::unique_ptr +NativeRegisterContextAIX::CreateHostNativeRegisterContextAIX( + const ArchSpec &target_arch, NativeThreadAIX &native_thread) { + switch (target_arch.GetMachine()) { + case llvm::Triple::ppc64: + return std::make_unique(target_arch, + native_thread); + default: + llvm_unreachable("have no register context for architecture"); + } +} + +NativeRegisterContextAIX_ppc64::NativeRegisterContextAIX_ppc64( + const ArchSpec &target_arch, NativeThreadProtocol &native_thread) + : NativeRegisterContextRegisterInfo( + native_thread, new RegisterInfoPOSIX_ppc64le(target_arch)), + NativeRegisterContextAIX(native_thread) { + if (target_arch.GetMachine() != llvm::Triple::ppc64) { 
+ llvm_unreachable("Unhandled target architecture."); + } + + ::memset(&m_gpr_ppc64le, 0, sizeof(m_gpr_ppc64le)); + ::memset(&m_fpr_ppc64le, 0, sizeof(m_fpr_ppc64le)); + ::memset(&m_vmx_ppc64le, 0, sizeof(m_vmx_ppc64le)); + ::memset(&m_vsx_ppc64le, 0, sizeof(m_vsx_ppc64le)); + ::memset(&m_hwp_regs, 0, sizeof(m_hwp_regs)); +} + +uint32_t NativeRegisterContextAIX_ppc64::GetRegisterSetCount() const { + return k_num_register_sets; +} + +const RegisterSet * +NativeRegisterContextAIX_ppc64::GetRegisterSet(uint32_t set_index) const { + if (set_index < k_num_register_sets) + return &g_reg_sets_ppc64le[set_index]; + + return nullptr; +} + +uint32_t NativeRegisterContextAIX_ppc64::GetUserRegisterCount() const { + uint32_t count = 0; + for (uint32_t set_index = 0; set_index < k_num_register_sets; ++set_index) + count += g_reg_sets_ppc64le[set_index].num_registers; + return count; +} + +Status NativeRegisterContextAIX_ppc64::ReadRegister( + const RegisterInfo *reg_info, RegisterValue ®_value) { + Status error; + + if (!reg_info) { + error.SetErrorString("reg_info NULL"); + return error; + } + + const uint32_t reg = reg_info->kinds[lldb::eRegisterKindLLDB]; + + if (IsFPR(reg)) { + error = ReadFPR(); + if (error.Fail()) + return error; + + // Get pointer to m_fpr_ppc64le variable and set the data from it. 
+ uint32_t fpr_offset = CalculateFprOffset(reg_info); + assert(fpr_offset < sizeof m_fpr_ppc64le); + uint8_t *src = (uint8_t *)&m_fpr_ppc64le + fpr_offset; + reg_value.SetFromMemoryData(*reg_info, src, reg_info->byte_size, + eByteOrderLittle, error); + } else if (IsVSX(reg)) { + uint32_t vsx_offset = CalculateVsxOffset(reg_info); + assert(vsx_offset < sizeof(m_vsx_ppc64le)); + + if (vsx_offset < sizeof(m_vsx_ppc64le) / 2) { + error = ReadVSX(); + if (error.Fail()) + return error; + + error = ReadFPR(); + if (error.Fail()) + return error; + + uint64_t value[2]; + uint8_t *dst, *src; + dst = (uint8_t *)&value; + src = (uint8_t *)&m_vsx_ppc64le + vsx_offset / 2; + ::memcpy(dst, src, 8); + dst += 8; + src = (uint8_t *)&m_fpr_ppc64le + vsx_offset / 2; + ::memcpy(dst, src, 8); + reg_value.SetFromMemoryData(*reg_info, &value, reg_info->byte_size, + eByteOrderLittle, error); + } else { + error = ReadVMX(); + if (error.Fail()) + return error; + + // Get pointer to m_vmx_ppc64le variable and set the data from it. + uint32_t vmx_offset = vsx_offset - sizeof(m_vsx_ppc64le) / 2; + uint8_t *src = (uint8_t *)&m_vmx_ppc64le + vmx_offset; + reg_value.SetFromMemoryData(*reg_info, src, reg_info->byte_size, + eByteOrderLittle, error); + } + } else if (IsVMX(reg)) { + error = ReadVMX(); + if (error.Fail()) + return error; + + // Get pointer to m_vmx_ppc64le variable and set the data from it. 
+ uint32_t vmx_offset = CalculateVmxOffset(reg_info); + assert(vmx_offset < sizeof m_vmx_ppc64le); + uint8_t *src = (uint8_t *)&m_vmx_ppc64le + vmx_offset; + reg_value.SetFromMemoryData(*reg_info, src, reg_info->byte_size, + eByteOrderLittle, error); + } else if (IsGPR(reg)) { + error = ReadGPR(); + if (error.Fail()) + return error; + + uint8_t *src = (uint8_t *) &m_gpr_ppc64le + reg_info->byte_offset; + reg_value.SetFromMemoryData(*reg_info, src, reg_info->byte_size, + eByteOrderLittle, error); + } else { + return Status("failed - register wasn't recognized to be a GPR, FPR, VSX " + "or VMX, read strategy unknown"); + } + + return error; +} + +Status NativeRegisterContextAIX_ppc64::WriteRegister( + const RegisterInfo *reg_info, const RegisterValue ®_value) { + Status error; + if (!reg_info) + return Status("reg_info NULL"); + + const uint32_t reg_index = reg_info->kinds[lldb::eRegisterKindLLDB]; + if (reg_index == LLDB_INVALID_REGNUM) + return Status("no lldb regnum for %s", reg_info && reg_info->name + ? reg_info->name + : ""); + + if (IsGPR(reg_index)) { + error = ReadGPR(); + if (error.Fail()) + return error; + + uint8_t *dst = (uint8_t *)&m_gpr_ppc64le + reg_info->byte_offset; + ::memcpy(dst, reg_value.GetBytes(), reg_value.GetByteSize()); + *(uint64_t *)dst = llvm::byteswap(*(uint64_t *)dst); + + error = WriteGPR(); + if (error.Fail()) + return error; + + return Status(); + } + + if (IsFPR(reg_index)) { + error = ReadFPR(); + if (error.Fail()) + return error; + + // Get pointer to m_fpr_ppc64le variable and set the data to it. 
+ uint32_t fpr_offset = CalculateFprOffset(reg_info); + assert(fpr_offset < GetFPRSize()); + uint8_t *dst = (uint8_t *)&m_fpr_ppc64le + fpr_offset; + ::memcpy(dst, reg_value.GetBytes(), reg_value.GetByteSize()); + + error = WriteFPR(); + if (error.Fail()) + return error; + + return Status(); + } + + if (IsVMX(reg_index)) { + error = ReadVMX(); + if (error.Fail()) + return error; + + // Get pointer to m_vmx_ppc64le variable and set the data to it. + uint32_t vmx_offset = CalculateVmxOffset(reg_info); + assert(vmx_offset < sizeof(m_vmx_ppc64le)); + uint8_t *dst = (uint8_t *)&m_vmx_ppc64le + vmx_offset; + ::memcpy(dst, reg_value.GetBytes(), reg_value.GetByteSize()); + + error = WriteVMX(); + if (error.Fail()) + return error; + + return Status(); + } + + if (IsVSX(reg_index)) { + uint32_t vsx_offset = CalculateVsxOffset(reg_info); + assert(vsx_offset < sizeof(m_vsx_ppc64le)); + + if (vsx_offset < sizeof(m_vsx_ppc64le) / 2) { + error = ReadVSX(); + if (error.Fail()) + return error; + + error = ReadFPR(); + if (error.Fail()) + return error; + + uint64_t value[2]; + ::memcpy(value, reg_value.GetBytes(), 16); + uint8_t *dst, *src; + src = (uint8_t *)value; + dst = (uint8_t *)&m_vsx_ppc64le + vsx_offset / 2; + ::memcpy(dst, src, 8); + src += 8; + dst = (uint8_t *)&m_fpr_ppc64le + vsx_offset / 2; + ::memcpy(dst, src, 8); + + WriteVSX(); + WriteFPR(); + } else { + error = ReadVMX(); + if (error.Fail()) + return error; + + // Get pointer to m_vmx_ppc64le variable and set the data from it. 
+ uint32_t vmx_offset = vsx_offset - sizeof(m_vsx_ppc64le) / 2; + uint8_t *dst = (uint8_t *)&m_vmx_ppc64le + vmx_offset; + ::memcpy(dst, reg_value.GetBytes(), reg_value.GetByteSize()); + WriteVMX(); + } + + return Status(); + } + + return Status("failed - register wasn't recognized to be a GPR, FPR, VSX " + "or VMX, write strategy unknown"); +} + +Status NativeRegisterContextAIX_ppc64::ReadAllRegisterValues( + lldb::WritableDataBufferSP &data_sp) { + Status error; + + data_sp.reset(new DataBufferHeap(REG_CONTEXT_SIZE, 0)); + error = ReadGPR(); + if (error.Fail()) + return error; + + error = ReadFPR(); + if (error.Fail()) + return error; + + error = ReadVMX(); + if (error.Fail()) + return error; + + error = ReadVSX(); + if (error.Fail()) + return error; + + uint8_t *dst = data_sp->GetBytes(); + ::memcpy(dst, &m_gpr_ppc64le, GetGPRSize()); + dst += GetGPRSize(); + ::memcpy(dst, &m_fpr_ppc64le, GetFPRSize()); + dst += GetFPRSize(); + ::memcpy(dst, &m_vmx_ppc64le, sizeof(m_vmx_ppc64le)); + dst += sizeof(m_vmx_ppc64le); + ::memcpy(dst, &m_vsx_ppc64le, sizeof(m_vsx_ppc64le)); + + return error; +} + +Status NativeRegisterContextAIX_ppc64::WriteAllRegisterValues( + const lldb::DataBufferSP &data_sp) { + Status error; + + if (!data_sp) { + error.SetErrorStringWithFormat( + "NativeRegisterContextAIX_ppc64::%s invalid data_sp provided", + __FUNCTION__); + return error; + } + + if (data_sp->GetByteSize() != REG_CONTEXT_SIZE) { + error.SetErrorStringWithFormat( + "NativeRegisterContextAIX_ppc64::%s data_sp contained mismatched " + "data size, expected %" PRIu64 ", actual %" PRIu64, + __FUNCTION__, REG_CONTEXT_SIZE, data_sp->GetByteSize()); + return error; + } + + const uint8_t *src = data_sp->GetBytes(); + if (src == nullptr) { + error.SetErrorStringWithFormat("NativeRegisterContextAIX_ppc64::%s " + "DataBuffer::GetBytes() returned a null " + "pointer", + __FUNCTION__); + return error; + } + + ::memcpy(&m_gpr_ppc64le, src, GetGPRSize()); + error = WriteGPR(); + + if 
(error.Fail()) + return error; + + src += GetGPRSize(); + ::memcpy(&m_fpr_ppc64le, src, GetFPRSize()); + + error = WriteFPR(); + if (error.Fail()) + return error; + + src += GetFPRSize(); + ::memcpy(&m_vmx_ppc64le, src, sizeof(m_vmx_ppc64le)); + + error = WriteVMX(); + if (error.Fail()) + return error; + + src += sizeof(m_vmx_ppc64le); + ::memcpy(&m_vsx_ppc64le, src, sizeof(m_vsx_ppc64le)); + error = WriteVSX(); + + return error; +} + +bool NativeRegisterContextAIX_ppc64::IsGPR(unsigned reg) const { + return reg <= k_last_gpr_ppc64le; // GPR's come first. +} + +bool NativeRegisterContextAIX_ppc64::IsFPR(unsigned reg) const { + return (k_first_fpr_ppc64le <= reg && reg <= k_last_fpr_ppc64le); +} + +uint32_t NativeRegisterContextAIX_ppc64::CalculateFprOffset( + const RegisterInfo *reg_info) const { + return reg_info->byte_offset - + GetRegisterInfoAtIndex(k_first_fpr_ppc64le)->byte_offset; +} + +uint32_t NativeRegisterContextAIX_ppc64::CalculateVmxOffset( + const RegisterInfo *reg_info) const { + return reg_info->byte_offset - + GetRegisterInfoAtIndex(k_first_vmx_ppc64le)->byte_offset; +} + +uint32_t NativeRegisterContextAIX_ppc64::CalculateVsxOffset( + const RegisterInfo *reg_info) const { + return reg_info->byte_offset - + GetRegisterInfoAtIndex(k_first_vsx_ppc64le)->byte_offset; +} + +Status NativeRegisterContextAIX_ppc64::ReadVMX() { + return NativeProcessAIX::PtraceWrapper(PTRACE_GETVRREGS, m_thread.GetID(), + nullptr, &m_vmx_ppc64le, + sizeof(m_vmx_ppc64le)); +} + +Status NativeRegisterContextAIX_ppc64::WriteVMX() { + //FIXME + int regset = 0/*NT_PPC_VMX*/; + return NativeProcessAIX::PtraceWrapper(PT_CLEAR/*PTRACE_SETVRREGS*/, m_thread.GetID(), + ®set, &m_vmx_ppc64le, + sizeof(m_vmx_ppc64le)); +} + +Status NativeRegisterContextAIX_ppc64::ReadVSX() { + return NativeProcessAIX::PtraceWrapper(PTRACE_GETVSRREGS, m_thread.GetID(), + nullptr, &m_vsx_ppc64le, + sizeof(m_vsx_ppc64le)); +} + +Status NativeRegisterContextAIX_ppc64::WriteVSX() { + //FIXME + int regset = 
0/*NT_PPC_VSX*/; + return NativeProcessAIX::PtraceWrapper(PT_CLEAR/*PTRACE_SETVSRREGS*/, m_thread.GetID(), + ®set, &m_vsx_ppc64le, + sizeof(m_vsx_ppc64le)); +} + +bool NativeRegisterContextAIX_ppc64::IsVMX(unsigned reg) { + return (reg >= k_first_vmx_ppc64le) && (reg <= k_last_vmx_ppc64le); +} + +bool NativeRegisterContextAIX_ppc64::IsVSX(unsigned reg) { + return (reg >= k_first_vsx_ppc64le) && (reg <= k_last_vsx_ppc64le); +} + +uint32_t NativeRegisterContextAIX_ppc64::NumSupportedHardwareWatchpoints() { + Log *log = GetLog(POSIXLog::Watchpoints); + + // Read hardware breakpoint and watchpoint information. + Status error = ReadHardwareDebugInfo(); + + if (error.Fail()) + return 0; + + LLDB_LOG(log, "{0}", m_max_hwp_supported); + return m_max_hwp_supported; +} + +uint32_t NativeRegisterContextAIX_ppc64::SetHardwareWatchpoint( + lldb::addr_t addr, size_t size, uint32_t watch_flags) { + Log *log = GetLog(POSIXLog::Watchpoints); + LLDB_LOG(log, "addr: {0:x}, size: {1:x} watch_flags: {2:x}", addr, size, + watch_flags); + + // Read hardware breakpoint and watchpoint information. + Status error = ReadHardwareDebugInfo(); + + if (error.Fail()) + return LLDB_INVALID_INDEX32; + + uint32_t control_value = 0, wp_index = 0; + lldb::addr_t real_addr = addr; + uint32_t rw_mode = 0; + + // Check if we are setting watchpoint other than read/write/access Update + // watchpoint flag to match ppc64le write-read bit configuration. + switch (watch_flags) { + case eWatchpointKindWrite: + //FIXME + //rw_mode = 0/*PPC_BREAKPOINT_TRIGGER_WRITE*/; + watch_flags = 2; + break; + // Watchpoint read not supported + case eWatchpointKindRead: + case (eWatchpointKindRead | eWatchpointKindWrite): + default: + return LLDB_INVALID_INDEX32; + } + + // Check if size has a valid hardware watchpoint length. + if (size != 8) + return LLDB_INVALID_INDEX32; + + // Check 8-byte alignment for hardware watchpoint target address. 
Below is a + // hack to recalculate address and size in order to make sure we can watch + // non 8-byte aligned addresses as well. + if (addr & 0x07) { + + addr_t begin = llvm::alignDown(addr, 8); + addr_t end = llvm::alignTo(addr + size, 8); + size = llvm::PowerOf2Ceil(end - begin); + + addr = addr & (~0x07); + } + + // Setup control value + control_value = watch_flags << 3; + control_value |= ((1 << size) - 1) << 5; + control_value |= (2 << 1) | 1; + + // Iterate over stored watchpoints and find a free wp_index + wp_index = LLDB_INVALID_INDEX32; + for (uint32_t i = 0; i < m_max_hwp_supported; i++) { + if ((m_hwp_regs[i].control & 1) == 0) { + wp_index = i; // Mark last free slot + } else if (m_hwp_regs[i].address == addr) { + return LLDB_INVALID_INDEX32; // We do not support duplicate watchpoints. + } + } + + if (wp_index == LLDB_INVALID_INDEX32) + return LLDB_INVALID_INDEX32; + + // Update watchpoint in local cache + m_hwp_regs[wp_index].real_addr = real_addr; + m_hwp_regs[wp_index].address = addr; + m_hwp_regs[wp_index].control = control_value; + //m_hwp_regs[wp_index].mode = rw_mode; + + // PTRACE call to set corresponding watchpoint register. + error = WriteHardwareDebugRegs(); + + if (error.Fail()) { + m_hwp_regs[wp_index].address = 0; + m_hwp_regs[wp_index].control &= llvm::maskTrailingZeros(1); + + return LLDB_INVALID_INDEX32; + } + + return wp_index; +} + +bool NativeRegisterContextAIX_ppc64::ClearHardwareWatchpoint( + uint32_t wp_index) { + Log *log = GetLog(POSIXLog::Watchpoints); + LLDB_LOG(log, "wp_index: {0}", wp_index); + + // Read hardware breakpoint and watchpoint information. + Status error = ReadHardwareDebugInfo(); + + if (error.Fail()) + return false; + + if (wp_index >= m_max_hwp_supported) + return false; + + // Create a backup we can revert to in case of failure. 
+ lldb::addr_t tempAddr = m_hwp_regs[wp_index].address; + uint32_t tempControl = m_hwp_regs[wp_index].control; + long *tempSlot = reinterpret_cast(m_hwp_regs[wp_index].slot); + + // Update watchpoint in local cache + m_hwp_regs[wp_index].control &= llvm::maskTrailingZeros(1); + m_hwp_regs[wp_index].address = 0; + m_hwp_regs[wp_index].slot = 0; + m_hwp_regs[wp_index].mode = 0; + + // Ptrace call to update hardware debug registers + //FIXME + error = NativeProcessAIX::PtraceWrapper(PT_CLEAR/*PPC_PTRACE_DELHWDEBUG*/, + m_thread.GetID(), 0, tempSlot); + + if (error.Fail()) { + m_hwp_regs[wp_index].control = tempControl; + m_hwp_regs[wp_index].address = tempAddr; + m_hwp_regs[wp_index].slot = reinterpret_cast(tempSlot); + + return false; + } + + return true; +} + +uint32_t +NativeRegisterContextAIX_ppc64::GetWatchpointSize(uint32_t wp_index) { + Log *log = GetLog(POSIXLog::Watchpoints); + LLDB_LOG(log, "wp_index: {0}", wp_index); + + unsigned control = (m_hwp_regs[wp_index].control >> 5) & 0xff; + if (llvm::isPowerOf2_32(control + 1)) { + return llvm::popcount(control); + } + + return 0; +} + +bool NativeRegisterContextAIX_ppc64::WatchpointIsEnabled( + uint32_t wp_index) { + Log *log = GetLog(POSIXLog::Watchpoints); + LLDB_LOG(log, "wp_index: {0}", wp_index); + + return !!((m_hwp_regs[wp_index].control & 0x1) == 0x1); +} + +Status NativeRegisterContextAIX_ppc64::GetWatchpointHitIndex( + uint32_t &wp_index, lldb::addr_t trap_addr) { + Log *log = GetLog(POSIXLog::Watchpoints); + LLDB_LOG(log, "wp_index: {0}, trap_addr: {1:x}", wp_index, trap_addr); + + uint32_t watch_size; + lldb::addr_t watch_addr; + + for (wp_index = 0; wp_index < m_max_hwp_supported; ++wp_index) { + watch_size = GetWatchpointSize(wp_index); + watch_addr = m_hwp_regs[wp_index].address; + + if (WatchpointIsEnabled(wp_index) && trap_addr >= watch_addr && + trap_addr <= watch_addr + watch_size) { + m_hwp_regs[wp_index].hit_addr = trap_addr; + return Status(); + } + } + + wp_index = LLDB_INVALID_INDEX32; + 
return Status(); +} + +lldb::addr_t +NativeRegisterContextAIX_ppc64::GetWatchpointAddress(uint32_t wp_index) { + Log *log = GetLog(POSIXLog::Watchpoints); + LLDB_LOG(log, "wp_index: {0}", wp_index); + + if (wp_index >= m_max_hwp_supported) + return LLDB_INVALID_ADDRESS; + + if (WatchpointIsEnabled(wp_index)) + return m_hwp_regs[wp_index].real_addr; + else + return LLDB_INVALID_ADDRESS; +} + +lldb::addr_t +NativeRegisterContextAIX_ppc64::GetWatchpointHitAddress(uint32_t wp_index) { + Log *log = GetLog(POSIXLog::Watchpoints); + LLDB_LOG(log, "wp_index: {0}", wp_index); + + if (wp_index >= m_max_hwp_supported) + return LLDB_INVALID_ADDRESS; + + if (WatchpointIsEnabled(wp_index)) + return m_hwp_regs[wp_index].hit_addr; + + return LLDB_INVALID_ADDRESS; +} + +Status NativeRegisterContextAIX_ppc64::ReadHardwareDebugInfo() { + if (!m_refresh_hwdebug_info) { + return Status(); + } + + m_max_hwp_supported = 1; + m_max_hbp_supported = 0; + m_refresh_hwdebug_info = false; + + return Status(); +} + +Status NativeRegisterContextAIX_ppc64::WriteHardwareDebugRegs() { + Status error; + long ret; + + for (uint32_t i = 0; i < m_max_hwp_supported; i++) { + if ((m_hwp_regs[i].control & 1) == 0) + continue; + + error = NativeProcessAIX::PtraceWrapper(PT_WATCH, m_thread.GetID(), (void *)m_hwp_regs[i].address, nullptr, 8, &ret); + + if (error.Fail()) + return error; + + m_hwp_regs[i].slot = ret; + } + return error; +} + +#endif // defined(__powerpc64__) diff --git a/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.h b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.h new file mode 100644 index 0000000000000..a29f786f2313a --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.h @@ -0,0 +1,138 @@ +//===-- NativeRegisterContextAIX_ppc64.h --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This implementation is related to the OpenPOWER ABI for Power Architecture +// 64-bit ELF V2 ABI + +#if defined(__powerpc64__) + +#ifndef lldb_NativeRegisterContextAIX_ppc64_h +#define lldb_NativeRegisterContextAIX_ppc64_h + +#include "Plugins/Process/AIX/NativeRegisterContextAIX.h" +#include "Plugins/Process/Utility/lldb-ppc64le-register-enums.h" + +#define DECLARE_REGISTER_INFOS_PPC64LE_STRUCT +#include "Plugins/Process/Utility/RegisterInfos_ppc64le.h" +#undef DECLARE_REGISTER_INFOS_PPC64LE_STRUCT + +namespace lldb_private { +namespace process_aix { + +class NativeProcessAIX; + +class NativeRegisterContextAIX_ppc64 : public NativeRegisterContextAIX { +public: + NativeRegisterContextAIX_ppc64(const ArchSpec &target_arch, + NativeThreadProtocol &native_thread); + + uint32_t GetRegisterSetCount() const override; + + uint32_t GetUserRegisterCount() const override; + + const RegisterSet *GetRegisterSet(uint32_t set_index) const override; + + Status ReadRegister(const RegisterInfo *reg_info, + RegisterValue ®_value) override; + + Status WriteRegister(const RegisterInfo *reg_info, + const RegisterValue ®_value) override; + + Status ReadAllRegisterValues(lldb::WritableDataBufferSP &data_sp) override; + + Status WriteAllRegisterValues(const lldb::DataBufferSP &data_sp) override; + + // Hardware watchpoint management functions + + uint32_t NumSupportedHardwareWatchpoints() override; + + uint32_t SetHardwareWatchpoint(lldb::addr_t addr, size_t size, + uint32_t watch_flags) override; + + bool ClearHardwareWatchpoint(uint32_t hw_index) override; + + Status GetWatchpointHitIndex(uint32_t &wp_index, + lldb::addr_t trap_addr) override; + + lldb::addr_t GetWatchpointHitAddress(uint32_t wp_index) override; + + lldb::addr_t GetWatchpointAddress(uint32_t wp_index) override; + + uint32_t GetWatchpointSize(uint32_t wp_index); + + bool 
WatchpointIsEnabled(uint32_t wp_index); + +protected: + bool IsVMX(unsigned reg); + + bool IsVSX(unsigned reg); + + Status ReadVMX(); + + Status WriteVMX(); + + Status ReadVSX(); + + Status WriteVSX(); + + void *GetGPRBuffer() override { return &m_gpr_ppc64le; } + + void *GetFPRBuffer() override { return &m_fpr_ppc64le; } + + size_t GetFPRSize() override { return sizeof(m_fpr_ppc64le); } + +private: + GPR m_gpr_ppc64le; // 64-bit general purpose registers. + FPR m_fpr_ppc64le; // floating-point registers including extended register. + VMX m_vmx_ppc64le; // VMX registers. + VSX m_vsx_ppc64le; // Last lower bytes from first VSX registers. + + bool IsGPR(unsigned reg) const; + + bool IsFPR(unsigned reg) const; + + bool IsVMX(unsigned reg) const; + + bool IsVSX(unsigned reg) const; + + uint32_t CalculateFprOffset(const RegisterInfo *reg_info) const; + + uint32_t CalculateVmxOffset(const RegisterInfo *reg_info) const; + + uint32_t CalculateVsxOffset(const RegisterInfo *reg_info) const; + + Status ReadHardwareDebugInfo(); + + Status WriteHardwareDebugRegs(); + + // Debug register info for hardware watchpoints management. + struct DREG { + lldb::addr_t address; // Breakpoint/watchpoint address value. + lldb::addr_t hit_addr; // Address at which last watchpoint trigger + // exception occurred. + lldb::addr_t real_addr; // Address value that should cause target to stop. + uint32_t control; // Breakpoint/watchpoint control value. + uint32_t refcount; // Serves as enable/disable and reference counter. + long slot; // Saves the value returned from PTRACE_SETHWDEBUG. + int mode; // Defines if watchpoint is read/write/access. 
+ }; + + std::array m_hwp_regs; + + // 16 is just a maximum value, query hardware for actual watchpoint count + uint32_t m_max_hwp_supported = 16; + uint32_t m_max_hbp_supported = 16; + bool m_refresh_hwdebug_info = true; +}; + +} // namespace process_aix +} // namespace lldb_private + +#endif // #ifndef lldb_NativeRegisterContextAIX_ppc64_h + +#endif // defined(__powerpc64__) diff --git a/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp new file mode 100644 index 0000000000000..e07daccdff550 --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp @@ -0,0 +1,526 @@ +//===-- NativeThreadAIX.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NativeThreadAIX.h" + +#include +#include + +#include "NativeProcessAIX.h" +#include "NativeRegisterContextAIX.h" + +#include "lldb/Host/HostNativeThread.h" +#include "lldb/Utility/LLDBAssert.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/State.h" +#include "lldb/lldb-enumerations.h" + +#include "llvm/ADT/SmallString.h" + +#include "Plugins/Process/POSIX/CrashReason.h" + +#include +#include +#include + +#if 0 +#include +// Try to define a macro to encapsulate the tgkill syscall +#define tgkill(pid, tid, sig) \ + syscall(__NR_tgkill, static_cast<::pid_t>(pid), static_cast<::pid_t>(tid), \ + sig) +#endif + +using namespace lldb; +using namespace lldb_private; +using namespace lldb_private::process_aix; + +namespace { +void LogThreadStopInfo(Log &log, const ThreadStopInfo &stop_info, + const char *const header) { + switch (stop_info.reason) { + case eStopReasonNone: + log.Printf("%s: %s no stop reason", 
__FUNCTION__, header); + return; + case eStopReasonTrace: + log.Printf("%s: %s trace, stopping signal 0x%" PRIx32, __FUNCTION__, header, + stop_info.signo); + return; + case eStopReasonBreakpoint: + log.Printf("%s: %s breakpoint, stopping signal 0x%" PRIx32, __FUNCTION__, + header, stop_info.signo); + return; + case eStopReasonWatchpoint: + log.Printf("%s: %s watchpoint, stopping signal 0x%" PRIx32, __FUNCTION__, + header, stop_info.signo); + return; + case eStopReasonSignal: + log.Printf("%s: %s signal 0x%02" PRIx32, __FUNCTION__, header, + stop_info.signo); + return; + case eStopReasonException: + log.Printf("%s: %s exception type 0x%02" PRIx64, __FUNCTION__, header, + stop_info.details.exception.type); + return; + case eStopReasonExec: + log.Printf("%s: %s exec, stopping signal 0x%" PRIx32, __FUNCTION__, header, + stop_info.signo); + return; + case eStopReasonPlanComplete: + log.Printf("%s: %s plan complete", __FUNCTION__, header); + return; + case eStopReasonThreadExiting: + log.Printf("%s: %s thread exiting", __FUNCTION__, header); + return; + case eStopReasonInstrumentation: + log.Printf("%s: %s instrumentation", __FUNCTION__, header); + return; + case eStopReasonProcessorTrace: + log.Printf("%s: %s processor trace", __FUNCTION__, header); + return; + default: + log.Printf("%s: %s invalid stop reason %" PRIu32, __FUNCTION__, header, + static_cast(stop_info.reason)); + } +} +} + +NativeThreadAIX::NativeThreadAIX(NativeProcessAIX &process, + lldb::tid_t tid) + : NativeThreadProtocol(process, tid), m_state(StateType::eStateInvalid), + m_stop_info(), + m_reg_context_up( + NativeRegisterContextAIX::CreateHostNativeRegisterContextAIX( + process.GetArchitecture(), *this)), + m_stop_description() {} + +std::string NativeThreadAIX::GetName() { + NativeProcessAIX &process = GetProcess(); + + auto BufferOrError = getProcFile(process.GetID(), GetID(), "comm"); + if (!BufferOrError) + return ""; + return std::string(BufferOrError.get()->getBuffer().rtrim('\n')); +} + 
+lldb::StateType NativeThreadAIX::GetState() { return m_state; } + +bool NativeThreadAIX::GetStopReason(ThreadStopInfo &stop_info, + std::string &description) { + Log *log = GetLog(LLDBLog::Thread); + + description.clear(); + + switch (m_state) { + case eStateStopped: + case eStateCrashed: + case eStateExited: + case eStateSuspended: + case eStateUnloaded: + if (log) + LogThreadStopInfo(*log, m_stop_info, "m_stop_info in thread:"); + stop_info = m_stop_info; + description = m_stop_description; + if (log) + LogThreadStopInfo(*log, stop_info, "returned stop_info:"); + + return true; + + case eStateInvalid: + case eStateConnected: + case eStateAttaching: + case eStateLaunching: + case eStateRunning: + case eStateStepping: + case eStateDetached: + if (log) { + LLDB_LOGF(log, + "NativeThreadAIX::%s tid %" PRIu64 + " in state %s cannot answer stop reason", + __FUNCTION__, GetID(), StateAsCString(m_state)); + } + return false; + } + llvm_unreachable("unhandled StateType!"); +} + +Status NativeThreadAIX::SetWatchpoint(lldb::addr_t addr, size_t size, + uint32_t watch_flags, bool hardware) { + if (!hardware) + return Status("not implemented"); + if (m_state == eStateLaunching) + return Status(); + Status error = RemoveWatchpoint(addr); + if (error.Fail()) + return error; + uint32_t wp_index = + m_reg_context_up->SetHardwareWatchpoint(addr, size, watch_flags); + if (wp_index == LLDB_INVALID_INDEX32) + return Status("Setting hardware watchpoint failed."); + m_watchpoint_index_map.insert({addr, wp_index}); + return Status(); +} + +Status NativeThreadAIX::RemoveWatchpoint(lldb::addr_t addr) { + auto wp = m_watchpoint_index_map.find(addr); + if (wp == m_watchpoint_index_map.end()) + return Status(); + uint32_t wp_index = wp->second; + m_watchpoint_index_map.erase(wp); + if (m_reg_context_up->ClearHardwareWatchpoint(wp_index)) + return Status(); + return Status("Clearing hardware watchpoint failed."); +} + +Status NativeThreadAIX::SetHardwareBreakpoint(lldb::addr_t addr, + size_t 
size) { + if (m_state == eStateLaunching) + return Status(); + + Status error = RemoveHardwareBreakpoint(addr); + if (error.Fail()) + return error; + + uint32_t bp_index = m_reg_context_up->SetHardwareBreakpoint(addr, size); + + if (bp_index == LLDB_INVALID_INDEX32) + return Status("Setting hardware breakpoint failed."); + + m_hw_break_index_map.insert({addr, bp_index}); + return Status(); +} + +Status NativeThreadAIX::RemoveHardwareBreakpoint(lldb::addr_t addr) { + auto bp = m_hw_break_index_map.find(addr); + if (bp == m_hw_break_index_map.end()) + return Status(); + + uint32_t bp_index = bp->second; + if (m_reg_context_up->ClearHardwareBreakpoint(bp_index)) { + m_hw_break_index_map.erase(bp); + return Status(); + } + + return Status("Clearing hardware breakpoint failed."); +} + +Status NativeThreadAIX::Resume(uint32_t signo) { + const StateType new_state = StateType::eStateRunning; + MaybeLogStateChange(new_state); + m_state = new_state; + + m_stop_info.reason = StopReason::eStopReasonNone; + m_stop_description.clear(); + + // If watchpoints have been set, but none on this thread, then this is a new + // thread. So set all existing watchpoints. + if (m_watchpoint_index_map.empty()) { + NativeProcessAIX &process = GetProcess(); + + const auto &watchpoint_map = process.GetWatchpointMap(); + m_reg_context_up->ClearAllHardwareWatchpoints(); + for (const auto &pair : watchpoint_map) { + const auto &wp = pair.second; + SetWatchpoint(wp.m_addr, wp.m_size, wp.m_watch_flags, wp.m_hardware); + } + } + + // Set all active hardware breakpoint on all threads. 
+ if (m_hw_break_index_map.empty()) { + NativeProcessAIX &process = GetProcess(); + + const auto &hw_breakpoint_map = process.GetHardwareBreakpointMap(); + m_reg_context_up->ClearAllHardwareBreakpoints(); + for (const auto &pair : hw_breakpoint_map) { + const auto &bp = pair.second; + SetHardwareBreakpoint(bp.m_addr, bp.m_size); + } + } + + intptr_t data = 0; + + if (signo != LLDB_INVALID_SIGNAL_NUMBER) + data = signo; + + return NativeProcessAIX::PtraceWrapper(PT_CONTINUE, GetID(), nullptr, + reinterpret_cast(data)); +} + +Status NativeThreadAIX::SingleStep(uint32_t signo) { + const StateType new_state = StateType::eStateStepping; + MaybeLogStateChange(new_state); + m_state = new_state; + m_stop_info.reason = StopReason::eStopReasonNone; + + intptr_t data = 0; + if (signo != LLDB_INVALID_SIGNAL_NUMBER) + data = signo; + + // If hardware single-stepping is not supported, we just do a continue. The + // breakpoint on the next instruction has been setup in + // NativeProcessAIX::Resume. + return NativeProcessAIX::PtraceWrapper( + GetProcess().SupportHardwareSingleStepping() ? 
PT_STEP : PT_CONTINUE, + m_tid, nullptr, reinterpret_cast(data)); +} + +void NativeThreadAIX::SetStoppedBySignal(uint32_t signo, + const siginfo_t *info) { + Log *log = GetLog(LLDBLog::Thread); + LLDB_LOGF(log, "NativeThreadAIX::%s called with signal 0x%02" PRIx32, + __FUNCTION__, signo); + + SetStopped(); + + m_stop_info.reason = StopReason::eStopReasonSignal; + m_stop_info.signo = signo; + + m_stop_description.clear(); + switch (signo) { + case SIGSEGV: + case SIGBUS: + case SIGFPE: + case SIGILL: + break; + } +} + +void NativeThreadAIX::AnnotateSyncTagCheckFault(const siginfo_t *info) { + int32_t allocation_tag_type = 0; + switch (GetProcess().GetArchitecture().GetMachine()) { + default: + return; + } + + auto details = + GetRegisterContext().GetMemoryTaggingDetails(allocation_tag_type); + if (!details) { + llvm::consumeError(details.takeError()); + return; + } + + // We assume that the stop description is currently: + // signal SIGSEGV: sync tag check fault (fault address: ) + // Remove the closing ) + m_stop_description.pop_back(); + + std::stringstream ss; + lldb::addr_t fault_addr = reinterpret_cast(info->si_addr); + std::unique_ptr manager(std::move(details->manager)); + + ss << " logical tag: 0x" << std::hex << manager->GetLogicalTag(fault_addr); + + std::vector allocation_tag_data; + // The fault address may not be granule aligned. ReadMemoryTags will granule + // align any range you give it, potentially making it larger. + // To prevent this set len to 1. This always results in a range that is at + // most 1 granule in size and includes fault_addr. 
+ Status status = GetProcess().ReadMemoryTags(allocation_tag_type, fault_addr, + 1, allocation_tag_data); + + if (status.Success()) { + llvm::Expected> allocation_tag = + manager->UnpackTagsData(allocation_tag_data, 1); + if (allocation_tag) { + ss << " allocation tag: 0x" << std::hex << allocation_tag->front() << ")"; + } else { + llvm::consumeError(allocation_tag.takeError()); + ss << ")"; + } + } else + ss << ")"; + + m_stop_description += ss.str(); +} + +bool NativeThreadAIX::IsStopped(int *signo) { + if (!StateIsStoppedState(m_state, false)) + return false; + + // If we are stopped by a signal, return the signo. + if (signo && m_state == StateType::eStateStopped && + m_stop_info.reason == StopReason::eStopReasonSignal) { + *signo = m_stop_info.signo; + } + + // Regardless, we are stopped. + return true; +} + +void NativeThreadAIX::SetStopped() { + // On every stop, clear any cached register data structures + GetRegisterContext().InvalidateAllRegisters(); + + const StateType new_state = StateType::eStateStopped; + MaybeLogStateChange(new_state); + m_state = new_state; + m_stop_description.clear(); +} + +void NativeThreadAIX::SetStoppedByExec() { + Log *log = GetLog(LLDBLog::Thread); + LLDB_LOGF(log, "NativeThreadAIX::%s()", __FUNCTION__); + + SetStopped(); + + m_stop_info.reason = StopReason::eStopReasonExec; + m_stop_info.signo = SIGSTOP; +} + +void NativeThreadAIX::SetStoppedByBreakpoint() { + SetStopped(); + + m_stop_info.reason = StopReason::eStopReasonBreakpoint; + m_stop_info.signo = SIGTRAP; + m_stop_description.clear(); +} + +void NativeThreadAIX::SetStoppedByWatchpoint(uint32_t wp_index) { + SetStopped(); + + lldbassert(wp_index != LLDB_INVALID_INDEX32 && "wp_index cannot be invalid"); + + std::ostringstream ostr; + ostr << m_reg_context_up->GetWatchpointAddress(wp_index) << " "; + ostr << wp_index; + + /* + * MIPS: Last 3bits of the watchpoint address are masked by the kernel. For + * example: + * 'n' is at 0x120010d00 and 'm' is 0x120010d04. 
When a watchpoint is set at + * 'm', then + * watch exception is generated even when 'n' is read/written. To handle this + * case, + * find the base address of the load/store instruction and append it in the + * stop-info + * packet. + */ + ostr << " " << m_reg_context_up->GetWatchpointHitAddress(wp_index); + + m_stop_description = ostr.str(); + + m_stop_info.reason = StopReason::eStopReasonWatchpoint; + m_stop_info.signo = SIGTRAP; +} + +bool NativeThreadAIX::IsStoppedAtBreakpoint() { + return GetState() == StateType::eStateStopped && + m_stop_info.reason == StopReason::eStopReasonBreakpoint; +} + +bool NativeThreadAIX::IsStoppedAtWatchpoint() { + return GetState() == StateType::eStateStopped && + m_stop_info.reason == StopReason::eStopReasonWatchpoint; +} + +void NativeThreadAIX::SetStoppedByTrace() { + SetStopped(); + + m_stop_info.reason = StopReason::eStopReasonTrace; + m_stop_info.signo = SIGTRAP; +} + +void NativeThreadAIX::SetStoppedByFork(bool is_vfork, lldb::pid_t child_pid) { + SetStopped(); + + m_stop_info.reason = + is_vfork ? 
StopReason::eStopReasonVFork : StopReason::eStopReasonFork; + m_stop_info.details.fork.child_pid = child_pid; + m_stop_info.details.fork.child_tid = child_pid; +} + +void NativeThreadAIX::SetStoppedByVForkDone() { + SetStopped(); + + m_stop_info.reason = StopReason::eStopReasonVForkDone; +} + +void NativeThreadAIX::SetStoppedWithNoReason() { + SetStopped(); + + m_stop_info.reason = StopReason::eStopReasonNone; + m_stop_info.signo = 0; +} + +void NativeThreadAIX::SetStoppedByProcessorTrace( + llvm::StringRef description) { + SetStopped(); + + m_stop_info.reason = StopReason::eStopReasonProcessorTrace; + m_stop_info.signo = 0; + m_stop_description = description.str(); +} + +void NativeThreadAIX::SetExited() { + const StateType new_state = StateType::eStateExited; + MaybeLogStateChange(new_state); + m_state = new_state; + + m_stop_info.reason = StopReason::eStopReasonThreadExiting; +} + +Status NativeThreadAIX::RequestStop() { + Log *log = GetLog(LLDBLog::Thread); + + NativeProcessAIX &process = GetProcess(); + + lldb::pid_t pid = process.GetID(); + lldb::tid_t tid = GetID(); + + LLDB_LOGF(log, + "NativeThreadAIX::%s requesting thread stop(pid: %" PRIu64 + ", tid: %" PRIu64 ")", + __FUNCTION__, pid, tid); + + Status err; + errno = 0; + if (::kill(pid, SIGSTOP) != 0) { + err.SetErrorToErrno(); + LLDB_LOGF(log, + "NativeThreadAIX::%s kill(%" PRIu64 ", SIGSTOP) failed: %s", + __FUNCTION__, pid, err.AsCString()); + } + return err; +} + +void NativeThreadAIX::MaybeLogStateChange(lldb::StateType new_state) { + Log *log = GetLog(LLDBLog::Thread); + // If we're not logging, we're done. + if (!log) + return; + + // If this is a state change to the same state, we're done. 
+ lldb::StateType old_state = m_state; + if (new_state == old_state) + return; + + LLDB_LOG(log, "pid={0}, tid={1}: changing from state {2} to {3}", + m_process.GetID(), GetID(), old_state, new_state); +} + +NativeProcessAIX &NativeThreadAIX::GetProcess() { + return static_cast(m_process); +} + +const NativeProcessAIX &NativeThreadAIX::GetProcess() const { + return static_cast(m_process); +} + +llvm::Expected> +NativeThreadAIX::GetSiginfo() const { + auto siginfo_buf = + llvm::WritableMemoryBuffer::getNewUninitMemBuffer(sizeof(siginfo_t)); +#if 0 + Status error = + GetProcess().GetSignalInfo(GetID(), siginfo_buf->getBufferStart()); + if (!error.Success()) + return error.ToError(); +#endif + return std::move(siginfo_buf); +} diff --git a/lldb/source/Plugins/Process/AIX/NativeThreadAIX.h b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.h new file mode 100644 index 0000000000000..706a7ce69da8e --- /dev/null +++ b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.h @@ -0,0 +1,126 @@ +//===-- NativeThreadAIX.h ----------------------------------- -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef liblldb_NativeThreadAIX_H_ +#define liblldb_NativeThreadAIX_H_ + +#include "Plugins/Process/AIX/NativeRegisterContextAIX.h" +#include "lldb/Host/common/NativeThreadProtocol.h" +#include "lldb/lldb-private-forward.h" + +#include "llvm/ADT/StringRef.h" + +#include +#include +#include +#include + +namespace lldb_private { +namespace process_aix { + +class NativeProcessAIX; + +class NativeThreadAIX : public NativeThreadProtocol { + friend class NativeProcessAIX; + +public: + NativeThreadAIX(NativeProcessAIX &process, lldb::tid_t tid); + + // NativeThreadProtocol Interface + std::string GetName() override; + + lldb::StateType GetState() override; + + bool GetStopReason(ThreadStopInfo &stop_info, + std::string &description) override; + + NativeRegisterContextAIX &GetRegisterContext() override { + return *m_reg_context_up; + } + + Status SetWatchpoint(lldb::addr_t addr, size_t size, uint32_t watch_flags, + bool hardware) override; + + Status RemoveWatchpoint(lldb::addr_t addr) override; + + Status SetHardwareBreakpoint(lldb::addr_t addr, size_t size) override; + + Status RemoveHardwareBreakpoint(lldb::addr_t addr) override; + + NativeProcessAIX &GetProcess(); + + const NativeProcessAIX &GetProcess() const; + + llvm::Expected> + GetSiginfo() const override; + +private: + // Interface for friend classes + + /// Resumes the thread. If \p signo is anything but + /// LLDB_INVALID_SIGNAL_NUMBER, deliver that signal to the thread. + Status Resume(uint32_t signo); + + /// Single steps the thread. If \p signo is anything but + /// LLDB_INVALID_SIGNAL_NUMBER, deliver that signal to the thread. + Status SingleStep(uint32_t signo); + + void SetStoppedBySignal(uint32_t signo, const siginfo_t *info = nullptr); + + /// Return true if the thread is stopped. + /// If stopped by a signal, indicate the signo in the signo argument. 
+ /// Otherwise, return LLDB_INVALID_SIGNAL_NUMBER. + bool IsStopped(int *signo); + + void SetStoppedByExec(); + + void SetStoppedByBreakpoint(); + + void SetStoppedByWatchpoint(uint32_t wp_index); + + bool IsStoppedAtBreakpoint(); + + bool IsStoppedAtWatchpoint(); + + void SetStoppedByTrace(); + + void SetStoppedByFork(bool is_vfork, lldb::pid_t child_pid); + + void SetStoppedByVForkDone(); + + void SetStoppedWithNoReason(); + + void SetStoppedByProcessorTrace(llvm::StringRef description); + + void SetExited(); + + Status RequestStop(); + + // Private interface + void MaybeLogStateChange(lldb::StateType new_state); + + void SetStopped(); + + /// Extend m_stop_description with logical and allocation tag values. + /// If there is an error along the way just add the information we were able + /// to get. + void AnnotateSyncTagCheckFault(const siginfo_t *info); + + // Member Variables + lldb::StateType m_state; + ThreadStopInfo m_stop_info; + std::unique_ptr m_reg_context_up; + std::string m_stop_description; + using WatchpointIndexMap = std::map; + WatchpointIndexMap m_watchpoint_index_map; + WatchpointIndexMap m_hw_break_index_map; +}; +} // namespace process_aix +} // namespace lldb_private + +#endif // #ifndef liblldb_NativeThreadAIX_H_ diff --git a/lldb/source/Plugins/Process/CMakeLists.txt b/lldb/source/Plugins/Process/CMakeLists.txt index a51d0f7afd175..01bb5f462eba4 100644 --- a/lldb/source/Plugins/Process/CMakeLists.txt +++ b/lldb/source/Plugins/Process/CMakeLists.txt @@ -7,6 +7,9 @@ elseif (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") elseif (CMAKE_SYSTEM_NAME MATCHES "NetBSD") add_subdirectory(NetBSD) add_subdirectory(POSIX) +elseif (CMAKE_SYSTEM_NAME MATCHES "AIX") + add_subdirectory(AIX) + add_subdirectory(POSIX) elseif (CMAKE_SYSTEM_NAME MATCHES "Windows") add_subdirectory(Windows/Common) elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin") diff --git a/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp b/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp 
index 32c71d87c7f58..db271357d792a 100644 --- a/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp +++ b/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp @@ -46,8 +46,34 @@ bool lldb_private::InferiorCallMmap(Process *process, addr_t &allocated_addr, function_options.include_inlines = false; SymbolContextList sc_list; +#if !defined(__AIX__) process->GetTarget().GetImages().FindFunctions( ConstString("mmap"), eFunctionNameTypeFull, function_options, sc_list); +#else + process->GetTarget().GetImages().FindFunctions( + ConstString("mmap64"), eFunctionNameTypeFull, function_options, sc_list); + SymbolContextList toc_list; + process->GetTarget().GetImages().FindSymbolsWithNameAndType( + ConstString("TOC"), lldb::eSymbolTypeAny, toc_list); + + AddressRange toc_range; + if (sc_list.GetSize() > 0) { + SymbolContext sc; + if (sc_list.GetContextAtIndex(0, sc)) { + for (int i = 0; i < toc_list.GetSize(); ++i) { + SymbolContext tocSC; + if (toc_list.GetContextAtIndex(i, tocSC)) { + if (tocSC.module_sp == sc.module_sp) { + if (tocSC.GetAddressRange(eSymbolContextSymbol, 0, false, + toc_range)) { + break; + } + } + } + } + } + } +#endif const uint32_t count = sc_list.GetSize(); if (count > 0) { SymbolContext sc; @@ -96,9 +122,16 @@ bool lldb_private::InferiorCallMmap(Process *process, addr_t &allocated_addr, MmapArgList args = process->GetTarget().GetPlatform()->GetMmapArgumentList( arch, addr, length, prot_arg, flags, fd, offset); +#if defined(__AIX__) + lldb::ThreadPlanSP call_plan_sp( + new ThreadPlanCallFunction(*thread, mmap_range.GetBaseAddress(), + toc_range.GetBaseAddress(), + void_ptr_type, args, options)); +#else lldb::ThreadPlanSP call_plan_sp( new ThreadPlanCallFunction(*thread, mmap_range.GetBaseAddress(), void_ptr_type, args, options)); +#endif if (call_plan_sp) { DiagnosticManager diagnostics; diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_ppc64le.cpp b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_ppc64le.cpp index 
159fd2856443c..d9b41d595147f 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_ppc64le.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_ppc64le.cpp @@ -23,6 +23,8 @@ static const lldb_private::RegisterInfo * GetRegisterInfoPtr(const lldb_private::ArchSpec &target_arch) { switch (target_arch.GetMachine()) { + //HH + case llvm::Triple::ppc64: case llvm::Triple::ppc64le: return g_register_infos_ppc64le; default: @@ -34,6 +36,8 @@ GetRegisterInfoPtr(const lldb_private::ArchSpec &target_arch) { static uint32_t GetRegisterInfoCount(const lldb_private::ArchSpec &target_arch) { switch (target_arch.GetMachine()) { + //HitchHike + case llvm::Triple::ppc64: case llvm::Triple::ppc64le: return static_cast(sizeof(g_register_infos_ppc64le) / sizeof(g_register_infos_ppc64le[0])); diff --git a/lldb/source/Plugins/Process/Utility/ThreadMemory.cpp b/lldb/source/Plugins/Process/Utility/ThreadMemory.cpp index 89ecc757a68f5..550b53688fd39 100644 --- a/lldb/source/Plugins/Process/Utility/ThreadMemory.cpp +++ b/lldb/source/Plugins/Process/Utility/ThreadMemory.cpp @@ -20,7 +20,7 @@ using namespace lldb; using namespace lldb_private; -ThreadMemory::ThreadMemory(Process &process, tid_t tid, +ThreadMemory::ThreadMemory(Process &process, lldb::tid_t tid, const ValueObjectSP &thread_info_valobj_sp) : Thread(process, tid), m_backing_thread_sp(), m_thread_info_valobj_sp(thread_info_valobj_sp), m_name(), m_queue(), diff --git a/lldb/source/Plugins/Process/gdb-remote/CMakeLists.txt b/lldb/source/Plugins/Process/gdb-remote/CMakeLists.txt index 6755999b18185..4eddbb5ec4cfd 100644 --- a/lldb/source/Plugins/Process/gdb-remote/CMakeLists.txt +++ b/lldb/source/Plugins/Process/gdb-remote/CMakeLists.txt @@ -6,6 +6,11 @@ lldb_tablegen(ProcessGDBRemotePropertiesEnum.inc -gen-lldb-property-enum-defs SOURCE ProcessGDBRemoteProperties.td TARGET LLDBPluginProcessGDBRemotePropertiesEnumGen) +if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + 
remove_definitions("-D_XOPEN_SOURCE=700") + add_definitions("-D_ALL_SOURCE") +endif() + set(LLDB_PLUGINS lldbPluginProcessUtility ) diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index 74e392249a94e..b7ecf7a5dc328 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -40,6 +40,10 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/JSON.h" +#if defined(__AIX__) +#include +#endif + #if defined(HAVE_LIBCOMPRESSION) #include #endif @@ -1710,6 +1714,32 @@ Status GDBRemoteCommunicationClient::GetMemoryRegionInfo( return error; } +#if defined(__AIX__) +Status GDBRemoteCommunicationClient::GetLDXINFO(struct ld_xinfo *info_ptr) +{ + Status error; + + char packet[64]; + const int packet_len = ::snprintf(packet, sizeof(packet), "qLDXINFO"); + assert(packet_len < (int)sizeof(packet)); + UNUSED_IF_ASSERT_DISABLED(packet_len); + StringExtractorGDBRemote response; + if (SendPacketAndWaitForResponse(packet, response) == + PacketResult::Success && + response.GetResponseType() == StringExtractorGDBRemote::eResponse) { + llvm::MutableArrayRef infoData((uint8_t *)info_ptr, sizeof(struct ld_xinfo)*64); + size_t got_bytes = response.GetHexBytesAvail(infoData); + if (got_bytes != sizeof(struct ld_xinfo)*64) { + error.SetErrorString("qLDXINFO ret bad size"); + return error; + } + } else { + error.SetErrorString("qLDXINFO is not supported"); + } + return error; +} +#endif + Status GDBRemoteCommunicationClient::GetQXferMemoryMapRegionInfo( lldb::addr_t addr, MemoryRegionInfo ®ion) { Status error = LoadQXferMemoryMap(); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 898d176abc346..520f37ac56716 100644 --- 
a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -32,6 +32,10 @@ #include "llvm/Support/VersionTuple.h" +#if defined(__AIX__) +struct ld_xinfo; +#endif + namespace lldb_private { namespace process_gdb_remote { @@ -196,6 +200,9 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { Status GetMemoryRegionInfo(lldb::addr_t addr, MemoryRegionInfo &range_info); std::optional GetWatchpointSlotCount(); +#if defined(__AIX__) + Status GetLDXINFO(struct ld_xinfo *info_ptr); +#endif std::optional GetWatchpointReportedAfter(); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index a0b08a219ae14..f019062986925 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -48,6 +48,9 @@ #include "ProcessGDBRemote.h" #include "ProcessGDBRemoteLog.h" #include "lldb/Utility/StringExtractorGDBRemote.h" +#if defined(__AIX__) +#include +#endif using namespace lldb; using namespace lldb_private; @@ -193,6 +196,8 @@ void GDBRemoteCommunicationServerLLGS::RegisterPacketHandlers() { &GDBRemoteCommunicationServerLLGS::Handle_Z); RegisterMemberFunctionHandler(StringExtractorGDBRemote::eServerPacketType_z, &GDBRemoteCommunicationServerLLGS::Handle_z); + RegisterMemberFunctionHandler(StringExtractorGDBRemote::eServerPacketType_qLDXINFO, + &GDBRemoteCommunicationServerLLGS::Handle_qLDXINFO); RegisterMemberFunctionHandler( StringExtractorGDBRemote::eServerPacketType_QPassSignals, &GDBRemoteCommunicationServerLLGS::Handle_QPassSignals); @@ -3006,6 +3011,29 @@ GDBRemoteCommunicationServerLLGS::Handle_z(StringExtractorGDBRemote &packet) { } } +GDBRemoteCommunication::PacketResult +GDBRemoteCommunicationServerLLGS::Handle_qLDXINFO(StringExtractorGDBRemote 
&packet) { + if (!m_current_process || + (m_current_process->GetID() == LLDB_INVALID_PROCESS_ID)) { + Log *log = GetLog(LLDBLog::Process); + LLDB_LOG(log, "qLDXINFO failed, no process available"); + return SendErrorResponse(0xff); + } + +#if defined(__AIX__) + // FIXME: buffer size + struct ld_xinfo info[64]; + if (ptrace64(PT_LDXINFO, m_current_process->GetID(), (long long)&(info[0]), sizeof(info), nullptr) != 0) { + return SendErrorResponse(0xff); + } + StreamGDBRemote response; + response.PutBytesAsRawHex8(&(info[0]), sizeof(info)); + return SendPacketNoLock(response.GetString()); +#else + return SendErrorResponse(0xff); +#endif +} + GDBRemoteCommunication::PacketResult GDBRemoteCommunicationServerLLGS::Handle_s(StringExtractorGDBRemote &packet) { Log *log = GetLog(LLDBLog::Process | LLDBLog::Thread); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.h index 646b6a102abf6..a464479e178de 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.h @@ -211,6 +211,8 @@ class GDBRemoteCommunicationServerLLGS PacketResult Handle_z(StringExtractorGDBRemote &packet); + PacketResult Handle_qLDXINFO(StringExtractorGDBRemote &packet); + PacketResult Handle_s(StringExtractorGDBRemote &packet); PacketResult Handle_qXfer(StringExtractorGDBRemote &packet); diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 6f9c2cc1e4b4e..10fbaa2b3c837 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -92,6 +92,10 @@ #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" +#if defined(__AIX__) +#include +#endif + #define DEBUGSERVER_BASENAME "debugserver" using namespace lldb; using 
namespace lldb_private; @@ -1513,7 +1517,7 @@ bool ProcessGDBRemote::DoUpdateThreadList(ThreadList &old_thread_list, ThreadList old_thread_list_copy(old_thread_list); if (num_thread_ids > 0) { for (size_t i = 0; i < num_thread_ids; ++i) { - tid_t tid = m_thread_ids[i]; + lldb::tid_t tid = m_thread_ids[i]; ThreadSP thread_sp( old_thread_list_copy.RemoveThreadByProtocolID(tid, false)); if (!thread_sp) { @@ -2945,6 +2949,13 @@ Status ProcessGDBRemote::DoGetMemoryRegionInfo(addr_t load_addr, return error; } +#if defined(__AIX__) +Status ProcessGDBRemote::DoGetLDXINFO(struct ld_xinfo *info_ptr) { + Status error(m_gdb_comm.GetLDXINFO(info_ptr)); + return error; +} +#endif + std::optional ProcessGDBRemote::GetWatchpointSlotCount() { return m_gdb_comm.GetWatchpointSlotCount(); } diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index 2492795851388..82200fbea21cd 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -37,6 +37,10 @@ #include "GDBRemoteCommunicationClient.h" #include "GDBRemoteRegisterContext.h" +#if defined(__AIX__) +struct ld_xinfo; +#endif + #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" @@ -423,6 +427,10 @@ class ProcessGDBRemote : public Process, Status DoGetMemoryRegionInfo(lldb::addr_t load_addr, MemoryRegionInfo ®ion_info) override; +#if defined(__AIX__) + Status DoGetLDXINFO(struct ld_xinfo *info_ptr) override; +#endif + private: // For ProcessGDBRemote only std::string m_partial_profile_data; diff --git a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp index 1da7696c9a352..930c707604bb3 100644 --- a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp +++ b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp @@ -593,19 +593,19 @@ bool ProcessMachCore::DoUpdateThreadList(ThreadList 
&old_thread_list, ObjectFile *core_objfile = m_core_module_sp->GetObjectFile(); if (core_objfile) { - std::set used_tids; + std::set used_tids; const uint32_t num_threads = core_objfile->GetNumThreadContexts(); - std::vector tids; + std::vector tids; if (core_objfile->GetCorefileThreadExtraInfos(tids)) { assert(tids.size() == num_threads); // Find highest tid value. - tid_t highest_tid = 0; + lldb::tid_t highest_tid = 0; for (uint32_t i = 0; i < num_threads; i++) { if (tids[i] != LLDB_INVALID_THREAD_ID && tids[i] > highest_tid) highest_tid = tids[i]; } - tid_t current_unused_tid = highest_tid + 1; + lldb::tid_t current_unused_tid = highest_tid + 1; for (uint32_t i = 0; i < num_threads; i++) { if (tids[i] == LLDB_INVALID_THREAD_ID) { tids[i] = current_unused_tid++; diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/CMakeLists.txt b/lldb/source/Plugins/ScriptInterpreter/Python/CMakeLists.txt index 7523d65abf0f8..1ce60a0b66154 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/CMakeLists.txt +++ b/lldb/source/Plugins/ScriptInterpreter/Python/CMakeLists.txt @@ -20,6 +20,11 @@ if (LLDB_ENABLE_LIBEDIT) endif() add_subdirectory(Interfaces) +if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + remove_definitions("-D_XOPEN_SOURCE=700") + add_definitions("-D_ALL_SOURCE") +endif() + add_lldb_library(lldbPluginScriptInterpreterPython PLUGIN PythonDataObjects.cpp diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp index e1f73f1997e36..92882cfc3da31 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp @@ -500,6 +500,8 @@ dw_addr_t DWARFFormValue::Address() const { &offset, index_size); } +bool UGLY_FLAG_FOR_AIX __attribute__((weak)) = false; + std::pair DWARFFormValue::ReferencedUnitAndOffset() const { uint64_t value = m_value.value.uval; @@ -512,6 +514,8 @@ DWARFFormValue::ReferencedUnitAndOffset() const { 
assert(m_unit); // Unit must be valid for DW_FORM_ref forms that are compile // unit relative or we will get this wrong value += m_unit->GetOffset(); + if (UGLY_FLAG_FOR_AIX) + value -= 8; if (!m_unit->ContainsDIEOffset(value)) { m_unit->GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError( "DW_FORM_ref* DIE reference {0:x16} is outside of its CU", value); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp index 66a762bf9b685..6721c1895a576 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp @@ -924,6 +924,12 @@ const DWARFDebugAranges &DWARFUnit::GetFunctionAranges() { return *m_func_aranges_up; } +/* AIX-NOTE - TODO: Removed conflicting code due to merge conflicts + * Refer Patches: 27,28,29,30,35 and 76 + * and modify the code accordingly. */ + +bool UGLY_FLAG_FOR_AIX __attribute__((weak)) = false; + llvm::Expected DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid, const DWARFDataExtractor &debug_info, @@ -1002,6 +1008,10 @@ const lldb_private::DWARFDataExtractor &DWARFUnit::GetData() const { uint32_t DWARFUnit::GetHeaderByteSize() const { switch (m_header.getUnitType()) { case llvm::dwarf::DW_UT_compile: + if (UGLY_FLAG_FOR_AIX) + return 11 + 4/*GetDWARFSizeOfOffset*/; + else + return GetVersion() < 5 ? 11 : 12; case llvm::dwarf::DW_UT_partial: return GetVersion() < 5 ? 
11 : 12; case llvm::dwarf::DW_UT_skeleton: @@ -1016,7 +1026,7 @@ uint32_t DWARFUnit::GetHeaderByteSize() const { std::optional DWARFUnit::GetStringOffsetSectionItem(uint32_t index) const { - offset_t offset = GetStrOffsetsBase() + index * 4; + lldb::offset_t offset = GetStrOffsetsBase() + index * 4; return m_dwarf.GetDWARFContext().getOrLoadStrOffsetsData().GetU32(&offset); } diff --git a/lldb/source/Plugins/SystemRuntime/MacOSX/AppleGetThreadItemInfoHandler.cpp b/lldb/source/Plugins/SystemRuntime/MacOSX/AppleGetThreadItemInfoHandler.cpp index 2064b73dc3ea5..824528fc3acfa 100644 --- a/lldb/source/Plugins/SystemRuntime/MacOSX/AppleGetThreadItemInfoHandler.cpp +++ b/lldb/source/Plugins/SystemRuntime/MacOSX/AppleGetThreadItemInfoHandler.cpp @@ -216,7 +216,7 @@ lldb::addr_t AppleGetThreadItemInfoHandler::SetupGetThreadItemInfoFunction( AppleGetThreadItemInfoHandler::GetThreadItemInfoReturnInfo AppleGetThreadItemInfoHandler::GetThreadItemInfo(Thread &thread, - tid_t thread_id, + lldb::tid_t thread_id, addr_t page_to_free, uint64_t page_to_free_size, Status &error) { diff --git a/lldb/source/Symbol/DWARFCallFrameInfo.cpp b/lldb/source/Symbol/DWARFCallFrameInfo.cpp index f3df8a2c27f5a..de244e372579d 100644 --- a/lldb/source/Symbol/DWARFCallFrameInfo.cpp +++ b/lldb/source/Symbol/DWARFCallFrameInfo.cpp @@ -33,7 +33,7 @@ using namespace lldb_private::dwarf; // Used for calls when the value type is specified by a DWARF EH Frame pointer // encoding. 
static uint64_t -GetGNUEHPointer(const DataExtractor &DE, offset_t *offset_ptr, +GetGNUEHPointer(const DataExtractor &DE, lldb::offset_t *offset_ptr, uint32_t eh_ptr_enc, addr_t pc_rel_addr, addr_t text_addr, addr_t data_addr) //, BSDRelocs *data_relocs) const { @@ -588,7 +588,7 @@ bool DWARFCallFrameInfo::FDEToUnwindPlan(dw_offset_t dwarf_offset, if (cie->augmentation[0] == 'z') { uint32_t aug_data_len = (uint32_t)m_cfi_data.GetULEB128(&offset); if (aug_data_len != 0 && cie->lsda_addr_encoding != DW_EH_PE_omit) { - offset_t saved_offset = offset; + lldb::offset_t saved_offset = offset; lsda_data_file_address = GetGNUEHPointer(m_cfi_data, &offset, cie->lsda_addr_encoding, pc_rel_addr, text_addr, data_addr); diff --git a/lldb/source/Target/ABI.cpp b/lldb/source/Target/ABI.cpp index 110b5c86fc425..6df03533cda29 100644 --- a/lldb/source/Target/ABI.cpp +++ b/lldb/source/Target/ABI.cpp @@ -208,6 +208,15 @@ bool ABI::PrepareTrivialCall(Thread &thread, lldb::addr_t sp, llvm_unreachable("Should never get here!"); } +bool ABI::PrepareTrivialCall(Thread &thread, lldb::addr_t sp, + lldb::addr_t functionAddress, + lldb::addr_t tocAddress, + lldb::addr_t returnAddress, + llvm::ArrayRef args) const { + // dummy prepare trivial call + llvm_unreachable("Should never get here!"); +} + bool ABI::GetFallbackRegisterLocation( const RegisterInfo *reg_info, UnwindPlan::Row::RegisterLocation &unwind_regloc) { diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index a42c44b761dc5..833489b16dfd7 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -1,3 +1,8 @@ +if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + remove_definitions("-D_XOPEN_SOURCE=700") + add_definitions("-D_ALL_SOURCE") +endif() + lldb_tablegen(TargetProperties.inc -gen-lldb-property-defs SOURCE TargetProperties.td TARGET LLDBTargetPropertiesGen) diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 
e3c4f2ee398cc..e31245178b2f2 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -75,6 +75,10 @@ #include "lldb/Utility/State.h" #include "lldb/Utility/Timer.h" +#if defined(__AIX__) +#include +#endif + using namespace lldb; using namespace lldb_private; using namespace std::chrono; @@ -6206,6 +6210,12 @@ Status Process::GetMemoryRegionInfo(lldb::addr_t load_addr, return DoGetMemoryRegionInfo(load_addr, range_info); } +#if defined(__AIX__) +Status Process::GetLDXINFO(struct ld_xinfo *info_ptr) { + return DoGetLDXINFO(info_ptr); +} +#endif + Status Process::GetMemoryRegions(lldb_private::MemoryRegionInfos ®ion_list) { Status error; diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp index a61228d092d89..57f42ea56cb18 100644 --- a/lldb/source/Target/RegisterContextUnwind.cpp +++ b/lldb/source/Target/RegisterContextUnwind.cpp @@ -40,6 +40,9 @@ #include #include +#ifdef __AIX__ +#include "Plugins/Process/Utility/lldb-ppc64le-register-enums.h" +#endif using namespace lldb; using namespace lldb_private; @@ -1257,6 +1260,10 @@ bool RegisterContextUnwind::IsTrapHandlerSymbol( // Answer the question: Where did THIS frame save the CALLER frame ("previous" // frame)'s register value? 
+#ifdef __AIX__ +extern bool UGLY_HACK_NULL_TOPFRAME; +#endif + enum UnwindLLDB::RegisterSearchResult RegisterContextUnwind::SavedLocationForRegister( uint32_t lldb_regnum, lldb_private::UnwindLLDB::RegisterLocation ®loc) { @@ -1517,6 +1524,11 @@ RegisterContextUnwind::SavedLocationForRegister( new_regloc.type = UnwindLLDB::RegisterLocation::eRegisterInLiveRegisterContext; new_regloc.location.register_number = regnum.GetAsKind(eRegisterKindLLDB); +#ifdef __AIX__ + if (UGLY_HACK_NULL_TOPFRAME && new_regloc.location.register_number == 0x20) { + new_regloc.location.register_number = 0x24; + } +#endif m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = new_regloc; regloc = new_regloc; UnwindLogMsg("supplying caller's register %s (%d) from the live " @@ -2368,6 +2380,40 @@ bool RegisterContextUnwind::ReadPC(addr_t &pc) { } } +#ifdef __AIX__ +bool RegisterContextUnwind::ReadLR(addr_t &lr) { + if (!IsValid()) + return false; + + bool above_trap_handler = false; + if (GetNextFrame().get() && GetNextFrame()->IsValid() && + GetNextFrame()->IsTrapHandlerFrame()) + above_trap_handler = true; + + if (ReadGPRValue(eRegisterKindLLDB, gpr_lr_ppc64le, lr)) { + // A lr value of 0 or 1 is impossible in the middle of the stack -- it + // indicates the end of a stack walk. + // On the currently executing frame (or such a frame interrupted + // asynchronously by sigtramp et al) this may occur if code has jumped + // through a NULL pointer -- we want to be able to unwind past that frame + // to help find the bug. + + ProcessSP process_sp (m_thread.GetProcess()); + if (process_sp) + { + ABI *abi = process_sp->GetABI().get(); + if (abi) + lr = abi->FixCodeAddress(lr); + } + + return !(m_all_registers_available == false && + above_trap_handler == false && (lr == 0 || lr == 1)); + } else { + return false; + } +} +#endif + void RegisterContextUnwind::UnwindLogMsg(const char *fmt, ...) 
{ Log *log = GetLog(LLDBLog::Unwind); if (!log) diff --git a/lldb/source/Target/ThreadPlanCallFunction.cpp b/lldb/source/Target/ThreadPlanCallFunction.cpp index 50dcb66b9719f..0926579ea2930 100644 --- a/lldb/source/Target/ThreadPlanCallFunction.cpp +++ b/lldb/source/Target/ThreadPlanCallFunction.cpp @@ -127,6 +127,40 @@ ThreadPlanCallFunction::ThreadPlanCallFunction( m_valid = true; } +ThreadPlanCallFunction::ThreadPlanCallFunction( + Thread &thread, const Address &function, const Address &toc, + const CompilerType &return_type, + llvm::ArrayRef args, const EvaluateExpressionOptions &options) + : ThreadPlan(ThreadPlan::eKindCallFunction, "Call function plan", thread, + eVoteNoOpinion, eVoteNoOpinion), + m_valid(false), m_stop_other_threads(options.GetStopOthers()), + m_unwind_on_error(options.DoesUnwindOnError()), + m_ignore_breakpoints(options.DoesIgnoreBreakpoints()), + m_debug_execution(options.GetDebug()), + m_trap_exceptions(options.GetTrapExceptions()), m_function_addr(function), + m_function_sp(0), m_takedown_done(false), + m_should_clear_objc_exception_bp(false), + m_should_clear_cxx_exception_bp(false), + m_stop_address(LLDB_INVALID_ADDRESS), m_return_type(return_type) { + lldb::addr_t start_load_addr = LLDB_INVALID_ADDRESS; + lldb::addr_t function_load_addr = LLDB_INVALID_ADDRESS; + lldb::addr_t toc_addr = LLDB_INVALID_ADDRESS; + ABI *abi = nullptr; + + if (!ConstructorSetup(thread, abi, start_load_addr, function_load_addr)) + return; + + toc_addr = toc.GetLoadAddress(&GetTarget()); + + if (!abi->PrepareTrivialCall(thread, m_function_sp, function_load_addr, + toc_addr, start_load_addr, args)) + return; + + ReportRegisterState("Function call was set up. 
Register state was:"); + + m_valid = true; +} + ThreadPlanCallFunction::ThreadPlanCallFunction( Thread &thread, const Address &function, const EvaluateExpressionOptions &options) diff --git a/lldb/source/Target/UnwindLLDB.cpp b/lldb/source/Target/UnwindLLDB.cpp index f43e940492b09..255b829738ba2 100644 --- a/lldb/source/Target/UnwindLLDB.cpp +++ b/lldb/source/Target/UnwindLLDB.cpp @@ -68,6 +68,10 @@ uint32_t UnwindLLDB::DoGetFrameCount() { return m_frames.size(); } +#ifdef __AIX__ +bool UGLY_HACK_NULL_TOPFRAME = false; +#endif + bool UnwindLLDB::AddFirstFrame() { if (m_frames.size() > 0) return true; @@ -91,6 +95,17 @@ bool UnwindLLDB::AddFirstFrame() { if (!reg_ctx_sp->ReadPC(first_cursor_sp->start_pc)) goto unwind_done; +#ifdef __AIX__ + lldb::addr_t lr; + if (!reg_ctx_sp->ReadLR(lr)) + goto unwind_done; + + if (first_cursor_sp->start_pc == 0) { + first_cursor_sp->start_pc = lr; + UGLY_HACK_NULL_TOPFRAME = true; + } +#endif + // Everything checks out, so release the auto pointer value and let the // cursor own it in its shared pointer first_cursor_sp->reg_ctx_lldb_sp = reg_ctx_sp; diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 07ef435ef451d..3868f77169cc6 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -14,6 +14,7 @@ #include "lldb/lldb-defines.h" #include "llvm/ADT/STLExtras.h" #include "llvm/BinaryFormat/COFF.h" +#include "llvm/BinaryFormat/XCOFF.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/Support/Compiler.h" @@ -459,10 +460,22 @@ static const ArchDefinition g_coff_arch_def = { "pe-coff", }; +static const ArchDefinitionEntry g_xcoff_arch_entries[] = { + {ArchSpec::eCore_ppc_generic, llvm::XCOFF::TCPU_COM, LLDB_INVALID_CPUTYPE, 0xFFFFFFFFu, 0xFFFFFFFFu}, + {ArchSpec::eCore_ppc64_generic, llvm::XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE, 0xFFFFFFFFu, 0xFFFFFFFFu} +}; + +static const ArchDefinition g_xcoff_arch_def = { + eArchTypeXCOFF, + 
std::size(g_xcoff_arch_entries), + g_xcoff_arch_entries, + "xcoff", +}; + //===----------------------------------------------------------------------===// // Table of all ArchDefinitions static const ArchDefinition *g_arch_definitions[] = { - &g_macho_arch_def, &g_elf_arch_def, &g_coff_arch_def}; + &g_macho_arch_def, &g_elf_arch_def, &g_coff_arch_def, &g_xcoff_arch_def}; //===----------------------------------------------------------------------===// // Static helper functions. @@ -903,6 +916,9 @@ bool ArchSpec::SetArchitecture(ArchitectureType arch_type, uint32_t cpu, } else if (arch_type == eArchTypeCOFF && os == llvm::Triple::Win32) { m_triple.setVendor(llvm::Triple::PC); m_triple.setOS(llvm::Triple::Win32); + } else if (arch_type == eArchTypeXCOFF && os == llvm::Triple::AIX) { + m_triple.setVendor(llvm::Triple::IBM); + m_triple.setOS(llvm::Triple::AIX); } else { m_triple.setVendor(llvm::Triple::UnknownVendor); m_triple.setOS(llvm::Triple::UnknownOS); diff --git a/lldb/source/Utility/StringExtractorGDBRemote.cpp b/lldb/source/Utility/StringExtractorGDBRemote.cpp index 9f79d2271b1e6..dbd3236536f8c 100644 --- a/lldb/source/Utility/StringExtractorGDBRemote.cpp +++ b/lldb/source/Utility/StringExtractorGDBRemote.cpp @@ -227,6 +227,8 @@ StringExtractorGDBRemote::GetServerPacketType() const { return eServerPacketType_qLaunchGDBServer; if (PACKET_MATCHES("qLaunchSuccess")) return eServerPacketType_qLaunchSuccess; + if (PACKET_MATCHES("qLDXINFO")) + return eServerPacketType_qLDXINFO; break; case 'M': diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 5ac474736eb63..413a1e5120288 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -155,7 +155,7 @@ if(TARGET clang) add_lldb_test_dependency(clang) # TestFullLtoStepping depends on LTO, and only runs when the compiler is clang. 
- add_lldb_test_dependency(LTO) + #add_lldb_test_dependency(LTO) if (TARGET libcxx OR ("libcxx" IN_LIST LLVM_ENABLE_RUNTIMES)) set(LLDB_HAS_LIBCXX ON) diff --git a/lldb/test/Shell/Expr/TestIRMemoryMap.test b/lldb/test/Shell/Expr/TestIRMemoryMap.test index 9dd0413be14cf..5ed61ad33ffc4 100644 --- a/lldb/test/Shell/Expr/TestIRMemoryMap.test +++ b/lldb/test/Shell/Expr/TestIRMemoryMap.test @@ -1,6 +1,6 @@ # UNSUPPORTED: system-windows -# RUN: %clangxx_host %p/Inputs/call-function.cpp -g -o %t +# RUN: %clangxx_host -std=c++11 %p/Inputs/env.cpp -o %t # RUN: lldb-test ir-memory-map %t %S/Inputs/ir-memory-map-basic # RUN: lldb-test ir-memory-map -host-only %t %S/Inputs/ir-memory-map-basic diff --git a/lldb/test/Shell/Process/TestEnvironment.test b/lldb/test/Shell/Process/TestEnvironment.test index e6d6e56fc9203..2ead258719f32 100644 --- a/lldb/test/Shell/Process/TestEnvironment.test +++ b/lldb/test/Shell/Process/TestEnvironment.test @@ -3,7 +3,7 @@ UNSUPPORTED: lldb-repro The double quotes around "BAR" ensure we don't match the command. 
-RUN: %clangxx_host -std=c++11 %p/Inputs/env.cpp -o %t +RUN: %clangxx_host -std=c++11 -I/compgpfs/build/xlcit/rings/openxlC/aix/wyvern_dev/ring0/latest/opt/IBM/openxlC/17.1.2/include/c++/v1/ %p/Inputs/env.cpp -o %t RUN: %lldb %t -o 'process launch --environment FOO="BAR"' | FileCheck %s RUN: %lldb %t -o 'env FOO="BAR"' -o 'process launch' | FileCheck %s diff --git a/lldb/tools/driver/CMakeLists.txt b/lldb/tools/driver/CMakeLists.txt index cd304a047dea6..78617be24f780 100644 --- a/lldb/tools/driver/CMakeLists.txt +++ b/lldb/tools/driver/CMakeLists.txt @@ -11,6 +11,11 @@ if(APPLE) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-sectcreate,__TEXT,__info_plist,${CMAKE_CURRENT_BINARY_DIR}/lldb-Info.plist") endif() +if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + remove_definitions("-D_XOPEN_SOURCE=700") + add_definitions("-D_ALL_SOURCE") +endif() + add_lldb_tool(lldb Driver.cpp Platform.cpp diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index 14371da64f2f2..f7eaf56738d7d 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -639,7 +639,7 @@ void sigwinch_handler(int signo) { } void sigint_handler(int signo) { -#ifdef _WIN32 // Restore handler as it is not persistent on Windows +#if defined(_WIN32) || defined(__AIX__) // Restore handler as it is not persistent on Windows signal(SIGINT, sigint_handler); #endif static std::atomic_flag g_interrupt_sent = ATOMIC_FLAG_INIT; @@ -727,8 +727,11 @@ static void printHelp(LLDBOptTable &table, llvm::StringRef tool_name) { int main(int argc, char const *argv[]) { // Editline uses for example iswprint which is dependent on LC_CTYPE. + // FIXME: this caused unexpected SIGTRAP on AIX +#ifndef __AIX__ std::setlocale(LC_ALL, ""); std::setlocale(LC_CTYPE, ""); +#endif // Setup LLVM signal handlers and make sure we call llvm_shutdown() on // destruction. 
diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index f8f0d86453f58..2fa6f6c9a5369 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -6,6 +6,10 @@ if (HAVE_LIBPTHREAD) list(APPEND extra_libs pthread) endif () +if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + add_definitions("-D_AIX") + add_definitions("-D_ALL_SOURCE") +endif() if(APPLE) configure_file( diff --git a/lldb/tools/lldb-server/CMakeLists.txt b/lldb/tools/lldb-server/CMakeLists.txt index 9030ed709a647..0d69ae32a008f 100644 --- a/lldb/tools/lldb-server/CMakeLists.txt +++ b/lldb/tools/lldb-server/CMakeLists.txt @@ -8,6 +8,10 @@ if(CMAKE_SYSTEM_NAME MATCHES "Linux|Android") list(APPEND LLDB_PLUGINS lldbPluginProcessLinux) endif() +if(CMAKE_SYSTEM_NAME MATCHES "AIX") + list(APPEND LLDB_PLUGINS lldbPluginProcessAIX) +endif() + if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") list(APPEND LLDB_PLUGINS lldbPluginProcessFreeBSD) endif() @@ -20,6 +24,8 @@ if(CMAKE_SYSTEM_NAME MATCHES "Darwin") list(APPEND LLDB_PLUGINS lldbPluginObjectFileMachO) elseif(CMAKE_SYSTEM_NAME MATCHES "Windows") list(APPEND LLDB_PLUGINS lldbPluginObjectFilePECOFF) +elseif(CMAKE_SYSTEM_NAME MATCHES "AIX") + list(APPEND LLDB_PLUGINS lldbPluginObjectFileXCOFF) else() list(APPEND LLDB_PLUGINS lldbPluginObjectFileELF) endif() @@ -54,6 +60,7 @@ add_lldb_tool(lldb-server lldbPluginInstructionMIPS lldbPluginInstructionMIPS64 lldbPluginInstructionRISCV + lldbPluginInstructionPPC64 ${LLDB_SYSTEM_LIBS} LINK_COMPONENTS diff --git a/lldb/tools/lldb-server/SystemInitializerLLGS.cpp b/lldb/tools/lldb-server/SystemInitializerLLGS.cpp index 4233252a84dfc..91bb2083a88b5 100644 --- a/lldb/tools/lldb-server/SystemInitializerLLGS.cpp +++ b/lldb/tools/lldb-server/SystemInitializerLLGS.cpp @@ -14,6 +14,9 @@ using HostObjectFile = ObjectFileMachO; #elif defined(_WIN32) #include "Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h" using HostObjectFile = ObjectFilePECOFF; +#elif defined(__AIX__) 
+#include "Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h" +using HostObjectFile = ObjectFileXCOFF; #else #include "Plugins/ObjectFile/ELF/ObjectFileELF.h" using HostObjectFile = ObjectFileELF; @@ -46,6 +49,10 @@ using HostObjectFile = ObjectFileELF; #include "Plugins/Instruction/MIPS/EmulateInstructionMIPS.h" #endif +#if defined(__AIX__) +#include "Plugins/Instruction/PPC64/EmulateInstructionPPC64.h" +#endif + #if defined(__riscv) #define LLDB_TARGET_RISCV #include "Plugins/Instruction/RISCV/EmulateInstructionRISCV.h" @@ -75,6 +82,10 @@ llvm::Error SystemInitializerLLGS::Initialize() { EmulateInstructionRISCV::Initialize(); #endif +#if defined(__AIX__) + EmulateInstructionPPC64::Initialize(); +#endif + return llvm::Error::success(); } @@ -97,5 +108,9 @@ void SystemInitializerLLGS::Terminate() { EmulateInstructionRISCV::Terminate(); #endif +#if defined(__AIX__) + EmulateInstructionPPC64::Terminate(); +#endif + SystemInitializerCommon::Terminate(); } diff --git a/lldb/tools/lldb-server/lldb-gdbserver.cpp b/lldb/tools/lldb-server/lldb-gdbserver.cpp index 563284730bc70..2a14f4f9c82aa 100644 --- a/lldb/tools/lldb-server/lldb-gdbserver.cpp +++ b/lldb/tools/lldb-server/lldb-gdbserver.cpp @@ -45,6 +45,8 @@ #include "Plugins/Process/NetBSD/NativeProcessNetBSD.h" #elif defined(_WIN32) #include "Plugins/Process/Windows/Common/NativeProcessWindows.h" +#elif defined(__AIX__) +#include "Plugins/Process/AIX/NativeProcessAIX.h" #endif #ifndef LLGS_PROGRAM_NAME @@ -70,6 +72,8 @@ typedef process_freebsd::NativeProcessFreeBSD::Manager NativeProcessManager; typedef process_netbsd::NativeProcessNetBSD::Manager NativeProcessManager; #elif defined(_WIN32) typedef NativeProcessWindows::Manager NativeProcessManager; +#elif defined(__AIX__) +typedef process_aix::NativeProcessAIX::Manager NativeProcessManager; #else // Dummy implementation to make sure the code compiles class NativeProcessManager : public NativeProcessProtocol::Manager { diff --git a/lldb/unittests/Host/FileSystemTest.cpp 
b/lldb/unittests/Host/FileSystemTest.cpp index 58887f6b2467e..89d0f5b87171a 100644 --- a/lldb/unittests/Host/FileSystemTest.cpp +++ b/lldb/unittests/Host/FileSystemTest.cpp @@ -59,7 +59,7 @@ class DummyFileSystem : public vfs::FileSystem { return I->second; } ErrorOr> - openFileForRead(const Twine &Path) override { + openFileForRead(const Twine &Path, bool IsText) override { auto S = status(Path); if (S) return std::unique_ptr(new DummyFile{*S}); diff --git a/lldb/unittests/Host/posix/TerminalTest.cpp b/lldb/unittests/Host/posix/TerminalTest.cpp index 5187a0c20a68b..f3de92c0852b1 100644 --- a/lldb/unittests/Host/posix/TerminalTest.cpp +++ b/lldb/unittests/Host/posix/TerminalTest.cpp @@ -94,15 +94,19 @@ TEST_F(TerminalTest, SetRaw) { TEST_F(TerminalTest, SetBaudRate) { struct termios terminfo; +#if (defined(__AIX__) && defined(B38400)) || !defined(__AIX__) ASSERT_THAT_ERROR(m_term.SetBaudRate(38400), llvm::Succeeded()); ASSERT_EQ(tcgetattr(m_fd, &terminfo), 0); EXPECT_EQ(cfgetispeed(&terminfo), static_cast(B38400)); EXPECT_EQ(cfgetospeed(&terminfo), static_cast(B38400)); +#endif +#if (defined(__AIX__) && defined(B115200)) || !defined(__AIX__) ASSERT_THAT_ERROR(m_term.SetBaudRate(115200), llvm::Succeeded()); ASSERT_EQ(tcgetattr(m_fd, &terminfo), 0); EXPECT_EQ(cfgetispeed(&terminfo), static_cast(B115200)); EXPECT_EQ(cfgetospeed(&terminfo), static_cast(B115200)); +#endif // uncommon value #if defined(B153600) diff --git a/llvm/include/llvm/Object/XCOFFObjectFile.h b/llvm/include/llvm/Object/XCOFFObjectFile.h index 5a7cd8e38f2b7..fa9c6781e24f5 100644 --- a/llvm/include/llvm/Object/XCOFFObjectFile.h +++ b/llvm/include/llvm/Object/XCOFFObjectFile.h @@ -542,7 +542,6 @@ class XCOFFObjectFile : public ObjectFile { template const T *sectionHeaderTable() const; size_t getFileHeaderSize() const; - size_t getSectionHeaderSize() const; const XCOFFSectionHeader32 *toSection32(DataRefImpl Ref) const; const XCOFFSectionHeader64 *toSection64(DataRefImpl Ref) const; @@ -578,6 +577,9 
@@ class XCOFFObjectFile : public ObjectFile { void checkSectionAddress(uintptr_t Addr, uintptr_t TableAddr) const; public: + size_t getSectionHeaderSize() const; + Expected getLoaderSectionAddress() const; + static constexpr uint64_t InvalidRelocOffset = std::numeric_limits::max(); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index bdd04b00f557b..9c96df1bbdc54 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -251,10 +251,16 @@ Expected DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const { return DA.getRelocatedValue(ItemSize, &Offset); } +bool UGLY_FLAG_FOR_AIX __attribute__((weak)) = false; + Error DWARFUnitHeader::extract(DWARFContext &Context, const DWARFDataExtractor &debug_info, uint64_t *offset_ptr, DWARFSectionKind SectionKind) { + if (UGLY_FLAG_FOR_AIX) { + // FIXME: hack to get version + *offset_ptr += 8; + } Offset = *offset_ptr; Error Err = Error::success(); IndexEntry = nullptr; @@ -267,8 +273,13 @@ Error DWARFUnitHeader::extract(DWARFContext &Context, AbbrOffset = debug_info.getRelocatedValue( FormParams.getDwarfOffsetByteSize(), offset_ptr, nullptr, &Err); } else { - AbbrOffset = debug_info.getRelocatedValue( - FormParams.getDwarfOffsetByteSize(), offset_ptr, nullptr, &Err); + if (UGLY_FLAG_FOR_AIX) { + AbbrOffset = debug_info.getRelocatedValue( + 8, offset_ptr, nullptr, &Err); + } else { + AbbrOffset = debug_info.getRelocatedValue( + FormParams.getDwarfOffsetByteSize(), offset_ptr, nullptr, &Err); + } FormParams.AddrSize = debug_info.getU8(offset_ptr, &Err); // Fake a unit type based on the section type. This isn't perfect, // but distinguishing compile and type units is generally enough. 
From b1da5b1cf35829fcbf4ad6564c6005c755012e47 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Wed, 7 Aug 2024 12:18:45 -0500 Subject: [PATCH 02/58] Code license notice --- lldb/NOTICE.TXT | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 lldb/NOTICE.TXT diff --git a/lldb/NOTICE.TXT b/lldb/NOTICE.TXT new file mode 100644 index 0000000000000..d814272967476 --- /dev/null +++ b/lldb/NOTICE.TXT @@ -0,0 +1,7 @@ + +This product contains a small piece of code to support AIX, taken from NetBSD. + + * LICENSE: + * lldb/source/Host/common/LICENSE.aix-netbsd.txt (OpenSSL License) + * HOMEPAGE: + * https://ftp.netbsd.org/pub/NetBSD/NetBSD-current/src/crypto/external/bsd/openssl/dist From 50ad673a78029fd6c47d90317e2c61ca2b59d5c5 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Wed, 7 Aug 2024 13:27:20 -0500 Subject: [PATCH 03/58] Reverting .tests --- lldb/test/Shell/Expr/TestIRMemoryMap.test | 2 +- lldb/test/Shell/Process/TestEnvironment.test | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/test/Shell/Expr/TestIRMemoryMap.test b/lldb/test/Shell/Expr/TestIRMemoryMap.test index 5ed61ad33ffc4..9dd0413be14cf 100644 --- a/lldb/test/Shell/Expr/TestIRMemoryMap.test +++ b/lldb/test/Shell/Expr/TestIRMemoryMap.test @@ -1,6 +1,6 @@ # UNSUPPORTED: system-windows -# RUN: %clangxx_host -std=c++11 %p/Inputs/env.cpp -o %t +# RUN: %clangxx_host %p/Inputs/call-function.cpp -g -o %t # RUN: lldb-test ir-memory-map %t %S/Inputs/ir-memory-map-basic # RUN: lldb-test ir-memory-map -host-only %t %S/Inputs/ir-memory-map-basic diff --git a/lldb/test/Shell/Process/TestEnvironment.test b/lldb/test/Shell/Process/TestEnvironment.test index 2ead258719f32..e6d6e56fc9203 100644 --- a/lldb/test/Shell/Process/TestEnvironment.test +++ b/lldb/test/Shell/Process/TestEnvironment.test @@ -3,7 +3,7 @@ UNSUPPORTED: lldb-repro The double quotes around "BAR" ensure we don't match the command. 
-RUN: %clangxx_host -std=c++11 -I/compgpfs/build/xlcit/rings/openxlC/aix/wyvern_dev/ring0/latest/opt/IBM/openxlC/17.1.2/include/c++/v1/ %p/Inputs/env.cpp -o %t +RUN: %clangxx_host -std=c++11 %p/Inputs/env.cpp -o %t RUN: %lldb %t -o 'process launch --environment FOO="BAR"' | FileCheck %s RUN: %lldb %t -o 'env FOO="BAR"' -o 'process launch' | FileCheck %s From c1967be8fe14d469cb5ae9d41d115a7003ff39b6 Mon Sep 17 00:00:00 2001 From: Lakshmi Surekha Kovvuri Date: Thu, 22 Aug 2024 08:49:50 -0500 Subject: [PATCH 04/58] For TestSuite Run --- lldb/unittests/Host/FileSystemTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/unittests/Host/FileSystemTest.cpp b/lldb/unittests/Host/FileSystemTest.cpp index 89d0f5b87171a..58887f6b2467e 100644 --- a/lldb/unittests/Host/FileSystemTest.cpp +++ b/lldb/unittests/Host/FileSystemTest.cpp @@ -59,7 +59,7 @@ class DummyFileSystem : public vfs::FileSystem { return I->second; } ErrorOr> - openFileForRead(const Twine &Path, bool IsText) override { + openFileForRead(const Twine &Path) override { auto S = status(Path); if (S) return std::unique_ptr(new DummyFile{*S}); From 758ab642d0974e799ac902d8ad240a3a90aeb24d Mon Sep 17 00:00:00 2001 From: Lakshmi Surekha Kovvuri Date: Fri, 30 Aug 2024 08:33:32 -0500 Subject: [PATCH 05/58] Changes made to AIX-specific files to eliminate errors encountered during CI when updating LLDB. 
--- .../Plugins/Process/AIX/NativeProcessAIX.cpp | 20 ++++++++++--------- .../Process/AIX/NativeRegisterContextAIX.cpp | 4 ++-- .../AIX/NativeRegisterContextAIX_ppc64.cpp | 10 +++++----- .../Plugins/Process/AIX/NativeThreadAIX.cpp | 2 +- .../GDBRemoteCommunicationClient.cpp | 4 ++-- 5 files changed, 21 insertions(+), 19 deletions(-) diff --git a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp index 882f20d30a3bf..5b01a66b0453f 100644 --- a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp +++ b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp @@ -211,12 +211,14 @@ static Status EnsureFDFlags(int fd, int flags) { int status = fcntl(fd, F_GETFL); if (status == -1) { - error.SetErrorToErrno(); + error = Status::FromErrno(); + // error.SetErrorToErrno(); return error; } if (fcntl(fd, F_SETFL, status | flags) == -1) { - error.SetErrorToErrno(); + error = Status::FromErrno(); + // error.SetErrorToErrno(); return error; } @@ -813,7 +815,7 @@ Status NativeProcessAIX::Resume(const ResumeActionList &resume_actions) { Status error = ResumeThread(static_cast(*thread), action->state, signo); if (error.Fail()) - return Status("NativeProcessAIX::%s: failed to resume thread " + return Status::FromErrorStringWithFormat("NativeProcessAIX::%s: failed to resume thread " "for pid %" PRIu64 ", tid %" PRIu64 ", error = %s", __FUNCTION__, GetID(), thread->GetID(), error.AsCString()); @@ -826,7 +828,7 @@ Status NativeProcessAIX::Resume(const ResumeActionList &resume_actions) { break; default: - return Status("NativeProcessAIX::%s (): unexpected state %s specified " + return Status::FromErrorStringWithFormat("NativeProcessAIX::%s (): unexpected state %s specified " "for pid %" PRIu64 ", tid %" PRIu64, __FUNCTION__, StateAsCString(action->state), GetID(), thread->GetID()); @@ -840,7 +842,7 @@ Status NativeProcessAIX::Halt() { Status error; if (kill(GetID(), SIGSTOP) != 0) - error.SetErrorToErrno(); + error = 
Status::FromErrno(); return error; } @@ -874,7 +876,7 @@ Status NativeProcessAIX::Signal(int signo) { Host::GetSignalAsCString(signo), GetID()); if (kill(GetID(), signo)) - error.SetErrorToErrno(); + error = Status::FromErrno(); return error; } @@ -951,7 +953,7 @@ Status NativeProcessAIX::Kill() { } if (kill(GetID(), SIGKILL) != 0) { - error.SetErrorToErrno(); + error = Status::FromErrno(); return error; } @@ -1555,7 +1557,7 @@ Status NativeProcessAIX::GetLoadedModuleFileSpec(const char *module_path, return Status(); } } - return Status("Module file (%s) not found in /proc/%" PRIu64 "/maps file!", + return Status::FromErrorStringWithFormat("Module file (%s) not found in /proc/%" PRIu64 "/maps file!", module_file_spec.GetFilename().AsCString(), GetID()); } @@ -2011,7 +2013,7 @@ Status NativeProcessAIX::PtraceWrapper(int req, lldb::pid_t pid, void *addr, } if (errno) { - error.SetErrorToErrno(); + error = Status::FromErrno(); ret = -1; } diff --git a/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp index 0859f9501c1b6..071e55543cc3c 100644 --- a/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp +++ b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX.cpp @@ -27,7 +27,7 @@ Status NativeRegisterContextAIX::ReadRegisterRaw(uint32_t reg_index, RegisterValue ®_value) { const RegisterInfo *const reg_info = GetRegisterInfoAtIndex(reg_index); if (!reg_info) - return Status("register %" PRIu32 " not found", reg_index); + return Status::FromErrorStringWithFormat("register %" PRIu32 " not found", reg_index); return DoReadRegisterValue(GetPtraceOffset(reg_index), reg_info->name, reg_info->byte_size, reg_value); @@ -82,7 +82,7 @@ NativeRegisterContextAIX::WriteRegisterRaw(uint32_t reg_index, assert(register_to_write_info_p && "register to write does not have valid RegisterInfo"); if (!register_to_write_info_p) - return Status("NativeRegisterContextAIX::%s failed to get RegisterInfo " + 
return Status::FromErrorStringWithFormat("NativeRegisterContextAIX::%s failed to get RegisterInfo " "for write register index %" PRIu32, __FUNCTION__, reg_to_write); diff --git a/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.cpp b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.cpp index 1996373791748..0132b52dec6f2 100644 --- a/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.cpp +++ b/lldb/source/Plugins/Process/AIX/NativeRegisterContextAIX_ppc64.cpp @@ -165,7 +165,7 @@ Status NativeRegisterContextAIX_ppc64::ReadRegister( Status error; if (!reg_info) { - error.SetErrorString("reg_info NULL"); + error.FromErrorString("reg_info NULL"); return error; } @@ -251,7 +251,7 @@ Status NativeRegisterContextAIX_ppc64::WriteRegister( const uint32_t reg_index = reg_info->kinds[lldb::eRegisterKindLLDB]; if (reg_index == LLDB_INVALID_REGNUM) - return Status("no lldb regnum for %s", reg_info && reg_info->name + return Status::FromErrorStringWithFormat("no lldb regnum for %s", reg_info && reg_info->name ? 
reg_info->name : ""); @@ -389,14 +389,14 @@ Status NativeRegisterContextAIX_ppc64::WriteAllRegisterValues( Status error; if (!data_sp) { - error.SetErrorStringWithFormat( + error = Status::FromErrorStringWithFormat( "NativeRegisterContextAIX_ppc64::%s invalid data_sp provided", __FUNCTION__); return error; } if (data_sp->GetByteSize() != REG_CONTEXT_SIZE) { - error.SetErrorStringWithFormat( + error = Status::FromErrorStringWithFormat( "NativeRegisterContextAIX_ppc64::%s data_sp contained mismatched " "data size, expected %" PRIu64 ", actual %" PRIu64, __FUNCTION__, REG_CONTEXT_SIZE, data_sp->GetByteSize()); @@ -405,7 +405,7 @@ Status NativeRegisterContextAIX_ppc64::WriteAllRegisterValues( const uint8_t *src = data_sp->GetBytes(); if (src == nullptr) { - error.SetErrorStringWithFormat("NativeRegisterContextAIX_ppc64::%s " + error = Status::FromErrorStringWithFormat("NativeRegisterContextAIX_ppc64::%s " "DataBuffer::GetBytes() returned a null " "pointer", __FUNCTION__); diff --git a/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp index e07daccdff550..bb14b6ab4a05e 100644 --- a/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp +++ b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp @@ -481,7 +481,7 @@ Status NativeThreadAIX::RequestStop() { Status err; errno = 0; if (::kill(pid, SIGSTOP) != 0) { - err.SetErrorToErrno(); + err = Status::FromErrno(); LLDB_LOGF(log, "NativeThreadAIX::%s kill(%" PRIu64 ", SIGSTOP) failed: %s", __FUNCTION__, pid, err.AsCString()); diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index 17926f8e4ab53..0aa68a4a09cbe 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -1728,11 +1728,11 @@ Status GDBRemoteCommunicationClient::GetLDXINFO(struct ld_xinfo *info_ptr) 
llvm::MutableArrayRef infoData((uint8_t *)info_ptr, sizeof(struct ld_xinfo)*64); size_t got_bytes = response.GetHexBytesAvail(infoData); if (got_bytes != sizeof(struct ld_xinfo)*64) { - error.SetErrorString("qLDXINFO ret bad size"); + error.FromErrorString("qLDXINFO ret bad size"); return error; } } else { - error.SetErrorString("qLDXINFO is not supported"); + error.FromErrorString("qLDXINFO is not supported"); } return error; } From 33d561f4bb74a2efd0da163ebde416c9ad1c2925 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Tue, 10 Sep 2024 02:00:09 -0500 Subject: [PATCH 06/58] Removed non-required PTRACE defs --- lldb/include/lldb/Host/aix/Ptrace.h | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/lldb/include/lldb/Host/aix/Ptrace.h b/lldb/include/lldb/Host/aix/Ptrace.h index 88928f18102d7..393928a89add3 100644 --- a/lldb/include/lldb/Host/aix/Ptrace.h +++ b/lldb/include/lldb/Host/aix/Ptrace.h @@ -34,29 +34,11 @@ #ifndef PTRACE_SETREGSET #define PTRACE_SETREGSET 0x4205 #endif -#ifndef PTRACE_GET_THREAD_AREA -#define PTRACE_GET_THREAD_AREA (PT_COMMAND_MAX+5) -#endif -#ifndef PTRACE_ARCH_PRCTL -#define PTRACE_ARCH_PRCTL (PT_COMMAND_MAX+6) -#endif -#ifndef ARCH_GET_FS -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 -#endif -#ifndef PTRACE_PEEKMTETAGS -#define PTRACE_PEEKMTETAGS (PT_COMMAND_MAX+7) -#endif -#ifndef PTRACE_POKEMTETAGS -#define PTRACE_POKEMTETAGS (PT_COMMAND_MAX+8) -#endif #ifndef PTRACE_GETVRREGS -#define PTRACE_GETVRREGS (PT_COMMAND_MAX+9) +#define PTRACE_GETVRREGS (PT_COMMAND_MAX+5) #endif #ifndef PTRACE_GETVSRREGS -#define PTRACE_GETVSRREGS (PT_COMMAND_MAX+10) +#define PTRACE_GETVSRREGS (PT_COMMAND_MAX+6) #endif #endif // liblldb_Host_aix_Ptrace_h_ From 450793d7270999ecdd6714c4222663517dab3928 Mon Sep 17 00:00:00 2001 From: Lakshmi Surekha Kovvuri Date: Wed, 11 Sep 2024 02:52:41 -0500 Subject: [PATCH 07/58] Patch for running of unit testcases 
without hang --- lldb/unittests/Host/MainLoopTest.cpp | 4 +++- lldb/unittests/Host/PipeTest.cpp | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/lldb/unittests/Host/MainLoopTest.cpp b/lldb/unittests/Host/MainLoopTest.cpp index 4084e90782fd5..9e92ec1470d4d 100644 --- a/lldb/unittests/Host/MainLoopTest.cpp +++ b/lldb/unittests/Host/MainLoopTest.cpp @@ -183,7 +183,7 @@ TEST_F(MainLoopTest, PendingCallbackAfterLoopExited) { loop.AddPendingCallback([&](MainLoopBase &loop) {}); } -#ifdef LLVM_ON_UNIX +#if defined(LLVM_ON_UNIX) && !defined(__AIX__) TEST_F(MainLoopTest, DetectsEOF) { PseudoTerminal term; @@ -202,7 +202,9 @@ TEST_F(MainLoopTest, DetectsEOF) { ASSERT_TRUE(loop.Run().Success()); ASSERT_EQ(1u, callback_count); } +// #endif +// #ifdef LLVM_ON_UNIX TEST_F(MainLoopTest, Signal) { MainLoop loop; Status error; diff --git a/lldb/unittests/Host/PipeTest.cpp b/lldb/unittests/Host/PipeTest.cpp index 506f3d225a21e..c1013aa7a7e4e 100644 --- a/lldb/unittests/Host/PipeTest.cpp +++ b/lldb/unittests/Host/PipeTest.cpp @@ -55,6 +55,7 @@ TEST_F(PipeTest, OpenAsReader) { } #endif +#if !defined(__AIX__) TEST_F(PipeTest, WriteWithTimeout) { Pipe pipe; ASSERT_THAT_ERROR(pipe.CreateNew(false).ToError(), llvm::Succeeded()); @@ -150,3 +151,4 @@ TEST_F(PipeTest, WriteWithTimeout) { .ToError(), llvm::Succeeded()); } +#endif From 61e7843b431ff3657e3c4b39d1559401ff3de891 Mon Sep 17 00:00:00 2001 From: Lakshmi Surekha Kovvuri Date: Thu, 12 Sep 2024 13:22:03 -0500 Subject: [PATCH 08/58] changes applied to NativeProcessAIX.cpp file to solve build errors while making LLDB up to date --- .../Plugins/Process/AIX/NativeProcessAIX.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp index 5b01a66b0453f..fc84763857453 100644 --- a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp +++ 
b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp @@ -862,7 +862,7 @@ Status NativeProcessAIX::Detach() { Status e = Detach(thread->GetID()); if (e.Fail()) error = - e; // Save the error, but still attempt to detach from other threads. + e.Clone(); // Save the error, but still attempt to detach from other threads. } return error; @@ -1240,7 +1240,7 @@ Status NativeProcessAIX::ReadMemoryTags(int32_t type, lldb::addr_t addr, llvm::Expected details = GetCurrentThread()->GetRegisterContext().GetMemoryTaggingDetails(type); if (!details) - return Status(details.takeError()); + return Status::FromError(details.takeError()); // Ignore 0 length read if (!len) @@ -1295,7 +1295,7 @@ Status NativeProcessAIX::WriteMemoryTags(int32_t type, lldb::addr_t addr, llvm::Expected details = GetCurrentThread()->GetRegisterContext().GetMemoryTaggingDetails(type); if (!details) - return Status(details.takeError()); + return Status::FromError(details.takeError()); // Ignore 0 length write if (!len) @@ -1312,18 +1312,18 @@ Status NativeProcessAIX::WriteMemoryTags(int32_t type, lldb::addr_t addr, llvm::Expected> unpacked_tags_or_err = details->manager->UnpackTagsData(tags); if (!unpacked_tags_or_err) - return Status(unpacked_tags_or_err.takeError()); + return Status::FromError(unpacked_tags_or_err.takeError()); llvm::Expected> repeated_tags_or_err = details->manager->RepeatTagsForRange(*unpacked_tags_or_err, range); if (!repeated_tags_or_err) - return Status(repeated_tags_or_err.takeError()); + return Status::FromError(repeated_tags_or_err.takeError()); // Repack them for ptrace to use llvm::Expected> final_tag_data = details->manager->PackTags(*repeated_tags_or_err); if (!final_tag_data) - return Status(final_tag_data.takeError()); + return Status::FromError(final_tag_data.takeError()); struct iovec tags_vec; uint8_t *src = final_tag_data->data(); @@ -1609,13 +1609,13 @@ Status NativeProcessAIX::ResumeThread(NativeThreadAIX &thread, // reflect it is running after this completes. 
switch (state) { case eStateRunning: { - const auto resume_result = thread.Resume(signo); + Status resume_result = thread.Resume(signo); if (resume_result.Success()) SetState(eStateRunning, true); return resume_result; } case eStateStepping: { - const auto step_result = thread.SingleStep(signo); + Status step_result = thread.SingleStep(signo); if (step_result.Success()) SetState(eStateRunning, true); return step_result; From 627a5427daba3fc5ea03ae481874f4aa1b4d2ed0 Mon Sep 17 00:00:00 2001 From: Lakshmi-Surekha Date: Fri, 13 Sep 2024 16:25:47 +0530 Subject: [PATCH 09/58] Revert "Removed non-required PTRACE defs" --- lldb/include/lldb/Host/aix/Ptrace.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/lldb/include/lldb/Host/aix/Ptrace.h b/lldb/include/lldb/Host/aix/Ptrace.h index 393928a89add3..88928f18102d7 100644 --- a/lldb/include/lldb/Host/aix/Ptrace.h +++ b/lldb/include/lldb/Host/aix/Ptrace.h @@ -34,11 +34,29 @@ #ifndef PTRACE_SETREGSET #define PTRACE_SETREGSET 0x4205 #endif +#ifndef PTRACE_GET_THREAD_AREA +#define PTRACE_GET_THREAD_AREA (PT_COMMAND_MAX+5) +#endif +#ifndef PTRACE_ARCH_PRCTL +#define PTRACE_ARCH_PRCTL (PT_COMMAND_MAX+6) +#endif +#ifndef ARCH_GET_FS +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 +#endif +#ifndef PTRACE_PEEKMTETAGS +#define PTRACE_PEEKMTETAGS (PT_COMMAND_MAX+7) +#endif +#ifndef PTRACE_POKEMTETAGS +#define PTRACE_POKEMTETAGS (PT_COMMAND_MAX+8) +#endif #ifndef PTRACE_GETVRREGS -#define PTRACE_GETVRREGS (PT_COMMAND_MAX+5) +#define PTRACE_GETVRREGS (PT_COMMAND_MAX+9) #endif #ifndef PTRACE_GETVSRREGS -#define PTRACE_GETVSRREGS (PT_COMMAND_MAX+6) +#define PTRACE_GETVSRREGS (PT_COMMAND_MAX+10) #endif #endif // liblldb_Host_aix_Ptrace_h_ From ea34b15d8568b4639b4e850ef032e684d82dd971 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Thu, 10 Oct 2024 00:38:18 -0500 Subject: [PATCH 10/58] Replaced __AIX__ with _AIX --- 
clang/test/SemaCXX/class-layout.cpp | 2 +- lldb/CMakeLists.txt | 2 +- lldb/include/lldb/Host/HostGetOpt.h | 2 +- lldb/include/lldb/Host/HostInfo.h | 2 +- lldb/include/lldb/Host/XML.h | 2 +- lldb/include/lldb/Host/common/GetOptInc.h | 4 ++-- lldb/include/lldb/Target/Process.h | 6 +++--- lldb/include/lldb/Target/RegisterContextUnwind.h | 2 +- lldb/source/Core/Mangled.cpp | 2 +- lldb/source/Core/Section.cpp | 2 +- lldb/source/Host/common/Host.cpp | 4 ++-- lldb/source/Host/common/XML.cpp | 2 +- lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp | 2 +- lldb/source/Host/posix/FileSystemPosix.cpp | 2 +- lldb/source/Host/posix/MainLoopPosix.cpp | 4 ++-- lldb/source/Host/posix/ProcessLauncherPosixFork.cpp | 2 +- lldb/source/Initialization/SystemInitializerCommon.cpp | 4 ++-- lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp | 4 ++-- .../DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp | 6 +++--- .../Plugins/Instruction/PPC64/EmulateInstructionPPC64.h | 2 +- lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp | 2 +- lldb/source/Plugins/Language/ObjC/Cocoa.cpp | 2 +- .../BSD-Archive/ObjectContainerBSDArchive.cpp | 2 +- .../Big-Archive/ObjectContainerBigArchive.cpp | 2 +- .../Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp | 2 +- lldb/source/Plugins/ObjectFile/PDB/ObjectFilePDB.cpp | 6 +++--- .../source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp | 6 +++--- lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp | 6 +++--- lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp | 4 ++-- .../Process/gdb-remote/GDBRemoteCommunicationClient.cpp | 4 ++-- .../Process/gdb-remote/GDBRemoteCommunicationClient.h | 4 ++-- .../gdb-remote/GDBRemoteCommunicationServerLLGS.cpp | 4 ++-- .../Plugins/Process/gdb-remote/ProcessGDBRemote.cpp | 4 ++-- lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h | 4 ++-- lldb/source/Target/Process.cpp | 4 ++-- lldb/source/Target/RegisterContextUnwind.cpp | 8 ++++---- lldb/source/Target/UnwindLLDB.cpp | 4 ++-- lldb/tools/driver/Driver.cpp | 4 
++-- lldb/tools/lldb-server/SystemInitializerLLGS.cpp | 8 ++++---- lldb/tools/lldb-server/lldb-gdbserver.cpp | 4 ++-- lldb/unittests/Host/MainLoopTest.cpp | 2 +- lldb/unittests/Host/PipeTest.cpp | 2 +- lldb/unittests/Host/posix/TerminalTest.cpp | 4 ++-- 43 files changed, 75 insertions(+), 75 deletions(-) diff --git a/clang/test/SemaCXX/class-layout.cpp b/clang/test/SemaCXX/class-layout.cpp index 22fb34b8419c5..0931d905a9749 100644 --- a/clang/test/SemaCXX/class-layout.cpp +++ b/clang/test/SemaCXX/class-layout.cpp @@ -639,7 +639,7 @@ namespace PR37275 { #pragma pack(pop) } -#endif // !defined(__MVS__) && !defined(__AIX__) +#endif // !defined(__MVS__) && !defined(_AIX) namespace non_pod { struct t1 { diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index 2e9ae0d0b3221..a4fd8bccf056d 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -39,7 +39,7 @@ include(LLDBConfig) include(AddLLDB) if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") - add_definitions("-D__AIX__") + add_definitions("-D_AIX") endif() # Define the LLDB_CONFIGURATION_xxx matching the build type. 
diff --git a/lldb/include/lldb/Host/HostGetOpt.h b/lldb/include/lldb/Host/HostGetOpt.h index f450e561d6afb..b2b436e64a692 100644 --- a/lldb/include/lldb/Host/HostGetOpt.h +++ b/lldb/include/lldb/Host/HostGetOpt.h @@ -9,7 +9,7 @@ #ifndef LLDB_HOST_HOSTGETOPT_H #define LLDB_HOST_HOSTGETOPT_H -#if !defined(_MSC_VER) && !defined(__NetBSD__) && !defined(__AIX__) +#if !defined(_MSC_VER) && !defined(__NetBSD__) && !defined(_AIX) #include #include diff --git a/lldb/include/lldb/Host/HostInfo.h b/lldb/include/lldb/Host/HostInfo.h index 156df8cf6901d..0f7ec0e0aa0d2 100644 --- a/lldb/include/lldb/Host/HostInfo.h +++ b/lldb/include/lldb/Host/HostInfo.h @@ -55,7 +55,7 @@ #elif defined(__APPLE__) #include "lldb/Host/macosx/HostInfoMacOSX.h" #define HOST_INFO_TYPE HostInfoMacOSX -#elif defined(__AIX__) +#elif defined(_AIX) #include "lldb/Host/aix/HostInfoAIX.h" #define HOST_INFO_TYPE HostInfoAIX #else diff --git a/lldb/include/lldb/Host/XML.h b/lldb/include/lldb/Host/XML.h index cf359f7726d5d..483589f1abc75 100644 --- a/lldb/include/lldb/Host/XML.h +++ b/lldb/include/lldb/Host/XML.h @@ -11,7 +11,7 @@ #include "lldb/Host/Config.h" -#if defined(__AIX__) +#if defined(_AIX) //FIXME for AIX #undef LLDB_ENABLE_LIBXML2 #endif diff --git a/lldb/include/lldb/Host/common/GetOptInc.h b/lldb/include/lldb/Host/common/GetOptInc.h index ebb475bfaf6b8..652e6174ff8b6 100644 --- a/lldb/include/lldb/Host/common/GetOptInc.h +++ b/lldb/include/lldb/Host/common/GetOptInc.h @@ -11,11 +11,11 @@ #include "lldb/lldb-defines.h" -#if defined(_MSC_VER) || defined(__AIX__) +#if defined(_MSC_VER) || defined(_AIX) #define REPLACE_GETOPT #define REPLACE_GETOPT_LONG #endif -#if defined(_MSC_VER) || defined(__NetBSD__) || defined(__AIX__) +#if defined(_MSC_VER) || defined(__NetBSD__) || defined(_AIX) #define REPLACE_GETOPT_LONG_ONLY #endif diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index 4a47ffd8d779d..d1527d316d678 100644 --- a/lldb/include/lldb/Target/Process.h +++ 
b/lldb/include/lldb/Target/Process.h @@ -65,7 +65,7 @@ #include "llvm/Support/Threading.h" #include "llvm/Support/VersionTuple.h" -#if defined(__AIX__) +#if defined(_AIX) struct ld_xinfo; #endif @@ -1884,7 +1884,7 @@ class Process : public std::enable_shared_from_this, Status GetMemoryRegionInfo(lldb::addr_t load_addr, MemoryRegionInfo &range_info); -#if defined(__AIX__) +#if defined(_AIX) Status GetLDXINFO(struct ld_xinfo *info_ptr); #endif @@ -2823,7 +2823,7 @@ void PruneThreadPlans(); "Process::DoGetMemoryRegionInfo() not supported"); } -#if defined(__AIX__) +#if defined(_AIX) virtual Status DoGetLDXINFO(struct ld_xinfo *info_ptr) { return Status("Process::DoGetLDXINFO() not supported"); } diff --git a/lldb/include/lldb/Target/RegisterContextUnwind.h b/lldb/include/lldb/Target/RegisterContextUnwind.h index 46c06cb422caf..b6176f8e5727f 100644 --- a/lldb/include/lldb/Target/RegisterContextUnwind.h +++ b/lldb/include/lldb/Target/RegisterContextUnwind.h @@ -67,7 +67,7 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { bool ReadPC(lldb::addr_t &start_pc); -#ifdef __AIX__ +#ifdef _AIX bool ReadLR(lldb::addr_t &lr); #endif diff --git a/lldb/source/Core/Mangled.cpp b/lldb/source/Core/Mangled.cpp index 43c5b043ef7a2..8f2e3562f6577 100644 --- a/lldb/source/Core/Mangled.cpp +++ b/lldb/source/Core/Mangled.cpp @@ -167,7 +167,7 @@ static char *GetItaniumDemangledStr(const char *M) { "Expected demangled_size to return length including trailing null"); } -#if !defined(__AIX__) +#if !defined(_AIX) if (Log *log = GetLog(LLDBLog::Demangle)) { if (demangled_cstr) LLDB_LOGF(log, "demangled itanium: %s -> \"%s\"", M, demangled_cstr); diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp index 9ed55853930a6..e0a9f7fcc7135 100644 --- a/lldb/source/Core/Section.cpp +++ b/lldb/source/Core/Section.cpp @@ -263,7 +263,7 @@ bool Section::ResolveContainedAddress(addr_t offset, Address &so_addr, bool Section::ContainsFileAddress(addr_t vm_addr) const { 
const addr_t file_addr = GetFileAddress(); -#ifdef __AIX__ +#ifdef _AIX if (file_addr == 0) return false; #endif diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index 94b1d0fd57d07..dc48cb87b5ce6 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -358,7 +358,7 @@ bool Host::ResolveExecutableInBundle(FileSpec &file) { return false; } #ifndef _WIN32 -#if defined(__AIX__) +#if defined(_AIX) #include extern char **p_xargv; @@ -525,7 +525,7 @@ static int dladdr(const void *ptr, Dl_info *dl) FileSpec Host::GetModuleFileSpecForHostAddress(const void *host_addr) { FileSpec module_filespec; #if !defined(__ANDROID__) -#ifdef __AIX__ +#ifdef _AIX if (host_addr == reinterpret_cast(HostInfoBase::ComputeSharedLibraryDirectory)) { // FIXME: AIX dladdr return "lldb" for this case if (p_xargv[0]) { diff --git a/lldb/source/Host/common/XML.cpp b/lldb/source/Host/common/XML.cpp index 62cac78aaac23..fbc409105fe60 100644 --- a/lldb/source/Host/common/XML.cpp +++ b/lldb/source/Host/common/XML.cpp @@ -10,7 +10,7 @@ #include "lldb/Host/XML.h" #include "llvm/ADT/StringExtras.h" -#if defined(__AIX__) +#if defined(_AIX) #undef LLDB_ENABLE_LIBXML2 #endif diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp index bd204c812b7e3..09c3fd2af6d3e 100644 --- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp +++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp @@ -722,7 +722,7 @@ ConnectionFileDescriptor::ConnectFD(llvm::StringRef s, ConnectionStatus ConnectionFileDescriptor::ConnectFile( llvm::StringRef s, socket_id_callback_type socket_id_callback, Status *error_ptr) { -#if !defined(__AIX__) +#if !defined(_AIX) #if LLDB_ENABLE_POSIX std::string addr_str = s.str(); // file:///PATH diff --git a/lldb/source/Host/posix/FileSystemPosix.cpp b/lldb/source/Host/posix/FileSystemPosix.cpp index 866fd8ac96c7b..21da5612ff6b8 100644 --- 
a/lldb/source/Host/posix/FileSystemPosix.cpp +++ b/lldb/source/Host/posix/FileSystemPosix.cpp @@ -11,7 +11,7 @@ // C includes #include #include -#if !defined(__AIX__) +#if !defined(_AIX) #include #endif #include diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp index d1eba52791a78..015570236b9d3 100644 --- a/lldb/source/Host/posix/MainLoopPosix.cpp +++ b/lldb/source/Host/posix/MainLoopPosix.cpp @@ -179,7 +179,7 @@ Status MainLoopPosix::RunImpl::Poll() { read_fds.push_back(pfd); } -#if defined(__AIX__) +#if defined(_AIX) sigset_t origmask; int timeout; @@ -325,7 +325,7 @@ MainLoopPosix::RegisterSignal(int signo, const Callback &callback, // If we're using kqueue, the signal needs to be unblocked in order to // receive it. If using pselect/ppoll, we need to block it, and later unblock // it as a part of the system call. -#if defined(__AIX__) +#if defined(_AIX) //FIXME: where is signal unblocked? ret = pthread_sigmask(SIG_UNBLOCK, &new_action.sa_mask, &old_set); #else diff --git a/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp b/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp index b8a96fbd19f02..f9f99decd39c2 100644 --- a/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp +++ b/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp @@ -193,7 +193,7 @@ struct ForkLaunchInfo { } // Start tracing this child that is about to exec. 
-#if !defined(__AIX__) +#if !defined(_AIX) if (ptrace(PT_TRACE_ME, 0, nullptr, 0) == -1) ExitWithError(error_fd, "ptrace"); #else diff --git a/lldb/source/Initialization/SystemInitializerCommon.cpp b/lldb/source/Initialization/SystemInitializerCommon.cpp index 4b01442a94bac..2e2d622d9981c 100644 --- a/lldb/source/Initialization/SystemInitializerCommon.cpp +++ b/lldb/source/Initialization/SystemInitializerCommon.cpp @@ -19,7 +19,7 @@ #include "lldb/Version/Version.h" #if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || \ - defined(__OpenBSD__) || defined(__AIX__) + defined(__OpenBSD__) || defined(_AIX) #include "Plugins/Process/POSIX/ProcessPOSIXLog.h" #endif @@ -79,7 +79,7 @@ llvm::Error SystemInitializerCommon::Initialize() { process_gdb_remote::ProcessGDBRemoteLog::Initialize(); #if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || \ - defined(__OpenBSD__) || defined(__AIX__) + defined(__OpenBSD__) || defined(_AIX) ProcessPOSIXLog::Initialize(); #endif #if defined(_WIN32) diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp index 88a82f4a0d20c..a3abb15ee625b 100644 --- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp +++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp @@ -156,7 +156,7 @@ bool ABISysV_ppc64::PrepareTrivialCall(Thread &thread, addr_t sp, if (!reg_ctx->WriteRegisterFromUnsigned(r12_reg_info, func_addr)) return false; -#if defined(__AIX__) +#if defined(_AIX) assert(0); #else // Read TOC pointer value. 
@@ -279,7 +279,7 @@ bool ABISysV_ppc64::PrepareTrivialCall(Thread &thread, addr_t sp, if (!reg_ctx->WriteRegisterFromUnsigned(r12_reg_info, func_addr)) return false; -#if defined(__AIX__) +#if defined(_AIX) LLDB_LOGF(log, "Writing R2: 0x%" PRIx64, (uint64_t)toc_addr); if (!reg_ctx->WriteRegisterFromUnsigned(r2_reg_info, toc_addr)) return false; diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp index 62663974134b0..7f3a638d5b028 100644 --- a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp @@ -18,7 +18,7 @@ #include "lldb/Target/ThreadPlanStepInstruction.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" -#if defined(__AIX__) +#if defined(_AIX) #include #endif @@ -160,7 +160,7 @@ void DynamicLoaderAIXDYLD::DidAttach() { auto error = m_process->LoadModules(); LLDB_LOG_ERROR(log, std::move(error), "failed to load modules: {0}"); -#if defined(__AIX__) +#if defined(_AIX) // Get struct ld_xinfo (FIXME) struct ld_xinfo ldinfo[64]; Status status = m_process->GetLDXINFO(&(ldinfo[0])); @@ -221,7 +221,7 @@ void DynamicLoaderAIXDYLD::DidLaunch() { LLDB_LOG_ERROR(log, std::move(error), "failed to load modules: {0}"); } -#if defined(__AIX__) +#if defined(_AIX) // Get struct ld_xinfo (FIXME) struct ld_xinfo ldinfo[64]; Status status = m_process->GetLDXINFO(&(ldinfo[0])); diff --git a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h index 1576c9700e557..d98b2880ca3b4 100644 --- a/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h +++ b/lldb/source/Plugins/Instruction/PPC64/EmulateInstructionPPC64.h @@ -39,7 +39,7 @@ class EmulateInstructionPPC64 : public EmulateInstruction { return true; case eInstructionTypePCModifying: -#if defined(__AIX__) +#if defined(_AIX) return true; #else 
return false; diff --git a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp index 690fb0d60a09a..9a52fb2f2adc5 100644 --- a/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp +++ b/lldb/source/Plugins/JITLoader/GDB/JITLoaderGDB.cpp @@ -194,7 +194,7 @@ void JITLoaderGDB::SetJITBreakpoint(lldb_private::ModuleList &module_list) { if (jit_addr == LLDB_INVALID_ADDRESS) return; -#if defined(__AIX__) +#if defined(_AIX) return; #endif diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp index fb5bc2c58e6fb..71f2b127afb12 100644 --- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp +++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp @@ -1227,7 +1227,7 @@ bool lldb_private::formatters::ObjCSELSummaryProvider( time_t lldb_private::formatters::GetOSXEpoch() { static time_t epoch = 0; if (!epoch) { -#if !defined(__AIX__) +#if !defined(_AIX) #ifndef _WIN32 tzset(); tm tm_epoch; diff --git a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp index 5ea55772c3aba..4f747ab20c9ef 100644 --- a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp +++ b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp @@ -8,7 +8,7 @@ #include "ObjectContainerBSDArchive.h" -#if defined(_WIN32) || defined(__ANDROID__) || defined(__AIX__) +#if defined(_WIN32) || defined(__ANDROID__) || defined(_AIX) // Defines from ar, missing on Windows #define SARMAG 8 #define ARFMAG "`\n" diff --git a/lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.cpp b/lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.cpp index 050ad73f1d19a..38756a0dd2969 100644 --- a/lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.cpp +++ b/lldb/source/Plugins/ObjectContainer/Big-Archive/ObjectContainerBigArchive.cpp @@ 
-8,7 +8,7 @@ #include "ObjectContainerBigArchive.h" -#if defined(_WIN32) || defined(__ANDROID__) || defined(__AIX__) +#if defined(_WIN32) || defined(__ANDROID__) || defined(_AIX) // Defines from ar, missing on Windows #define ARMAG "!\n" #define SARMAG 8 diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp index d8834af2c33ef..9aab76c6c48ba 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp @@ -51,7 +51,7 @@ size_t ObjectFileMinidump::GetModuleSpecifications( const lldb_private::FileSpec &file, lldb::DataBufferSP &data_sp, lldb::offset_t data_offset, lldb::offset_t file_offset, lldb::offset_t length, lldb_private::ModuleSpecList &specs) { -#if !defined(__AIX__) +#if !defined(_AIX) specs.Clear(); #endif return 0; diff --git a/lldb/source/Plugins/ObjectFile/PDB/ObjectFilePDB.cpp b/lldb/source/Plugins/ObjectFile/PDB/ObjectFilePDB.cpp index 75cc54e4f0d48..d76d6adb1be2c 100644 --- a/lldb/source/Plugins/ObjectFile/PDB/ObjectFilePDB.cpp +++ b/lldb/source/Plugins/ObjectFile/PDB/ObjectFilePDB.cpp @@ -117,7 +117,7 @@ size_t ObjectFilePDB::GetModuleSpecifications( llvm::BumpPtrAllocator allocator; std::unique_ptr pdb_file = loadPDBFile(file.GetPath(), allocator); if (!pdb_file){ -#if !defined(__AIX__) +#if !defined(_AIX) return initial_count; #else return specs.GetSize() - initial_count; @@ -127,7 +127,7 @@ size_t ObjectFilePDB::GetModuleSpecifications( auto info_stream = pdb_file->getPDBInfoStream(); if (!info_stream) { llvm::consumeError(info_stream.takeError()); -#if !defined(__AIX__) +#if !defined(_AIX) return initial_count; #else return specs.GetSize() - initial_count; @@ -136,7 +136,7 @@ size_t ObjectFilePDB::GetModuleSpecifications( auto dbi_stream = pdb_file->getPDBDbiStream(); if (!dbi_stream) { llvm::consumeError(dbi_stream.takeError()); -#if !defined(__AIX__) +#if !defined(_AIX) return 
initial_count; #else return specs.GetSize() - initial_count; diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index 519ce2ca4a0b2..02a86234bd363 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -254,7 +254,7 @@ size_t ObjectFilePECOFF::GetModuleSpecifications( lldb::offset_t length, lldb_private::ModuleSpecList &specs) { const size_t initial_count = specs.GetSize(); if (!data_sp || !ObjectFilePECOFF::MagicBytesMatch(data_sp)){ -#if !defined(__AIX__) +#if !defined(_AIX) return initial_count; #else return specs.GetSize() - initial_count; @@ -272,7 +272,7 @@ size_t ObjectFilePECOFF::GetModuleSpecifications( if (!binary) { LLDB_LOG_ERROR(log, binary.takeError(), "Failed to create binary for file ({1}): {0}", file); -#if !defined(__AIX__) +#if !defined(_AIX) return initial_count; #else return specs.GetSize() - initial_count; @@ -281,7 +281,7 @@ size_t ObjectFilePECOFF::GetModuleSpecifications( auto *COFFObj = llvm::dyn_cast(binary->get()); if (!COFFObj){ -#if !defined(__AIX__) +#if !defined(_AIX) return initial_count; #else return specs.GetSize() - initial_count; diff --git a/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp b/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp index b6b08b73bec41..5c94477002978 100644 --- a/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp +++ b/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp @@ -31,7 +31,7 @@ // Define these constants from AIX mman.h for use when targeting remote aix // systems even when host has different values. 
-#if defined(__AIX__) +#if defined(_AIX) #include #endif @@ -80,7 +80,7 @@ void PlatformAIX::Initialize() { PlatformPOSIX::Initialize(); if (g_initialize_count++ == 0) { -#if defined(__AIX__) +#if defined(_AIX) PlatformSP default_platform_sp(new PlatformAIX(true)); default_platform_sp->SetSystemArchitecture(HostInfo::GetArchitecture()); Platform::SetHostPlatform(default_platform_sp); @@ -294,7 +294,7 @@ MmapArgList PlatformAIX::GetMmapArgumentList(const ArchSpec &arch, addr_t addr, addr_t length, unsigned prot, unsigned flags, addr_t fd, addr_t offset) { -#if defined(__AIX__) +#if defined(_AIX) unsigned flags_platform = MAP_VARIABLE | MAP_PRIVATE | MAP_ANONYMOUS; #else unsigned flags_platform = 0; diff --git a/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp b/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp index db271357d792a..ea758caa653a1 100644 --- a/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp +++ b/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp @@ -46,7 +46,7 @@ bool lldb_private::InferiorCallMmap(Process *process, addr_t &allocated_addr, function_options.include_inlines = false; SymbolContextList sc_list; -#if !defined(__AIX__) +#if !defined(_AIX) process->GetTarget().GetImages().FindFunctions( ConstString("mmap"), eFunctionNameTypeFull, function_options, sc_list); #else @@ -122,7 +122,7 @@ bool lldb_private::InferiorCallMmap(Process *process, addr_t &allocated_addr, MmapArgList args = process->GetTarget().GetPlatform()->GetMmapArgumentList( arch, addr, length, prot_arg, flags, fd, offset); -#if defined(__AIX__) +#if defined(_AIX) lldb::ThreadPlanSP call_plan_sp( new ThreadPlanCallFunction(*thread, mmap_range.GetBaseAddress(), toc_range.GetBaseAddress(), diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index 443b7c7b2c7fb..fa0a3b5d4dc38 100644 --- 
a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -41,7 +41,7 @@ #include "llvm/Config/llvm-config.h" // for LLVM_ENABLE_ZLIB #include "llvm/Support/JSON.h" -#if defined(__AIX__) +#if defined(_AIX) #include #endif @@ -1715,7 +1715,7 @@ Status GDBRemoteCommunicationClient::GetMemoryRegionInfo( return error; } -#if defined(__AIX__) +#if defined(_AIX) Status GDBRemoteCommunicationClient::GetLDXINFO(struct ld_xinfo *info_ptr) { Status error; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 520f37ac56716..1812fc9b7ca65 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -32,7 +32,7 @@ #include "llvm/Support/VersionTuple.h" -#if defined(__AIX__) +#if defined(_AIX) struct ld_xinfo; #endif @@ -200,7 +200,7 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { Status GetMemoryRegionInfo(lldb::addr_t addr, MemoryRegionInfo &range_info); std::optional GetWatchpointSlotCount(); -#if defined(__AIX__) +#if defined(_AIX) Status GetLDXINFO(struct ld_xinfo *info_ptr); #endif diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 4f1ef0898ba08..27be61a474238 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -48,7 +48,7 @@ #include "ProcessGDBRemote.h" #include "ProcessGDBRemoteLog.h" #include "lldb/Utility/StringExtractorGDBRemote.h" -#if defined(__AIX__) +#if defined(_AIX) #include #endif @@ -3011,7 +3011,7 @@ GDBRemoteCommunicationServerLLGS::Handle_qLDXINFO(StringExtractorGDBRemote &pack return 
SendErrorResponse(0xff); } -#if defined(__AIX__) +#if defined(_AIX) // FIXME: buffer size struct ld_xinfo info[64]; if (ptrace64(PT_LDXINFO, m_current_process->GetID(), (long long)&(info[0]), sizeof(info), nullptr) != 0) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index ca381290d0e9f..5b7ce5f1424d9 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -92,7 +92,7 @@ #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" -#if defined(__AIX__) +#if defined(_AIX) #include #endif @@ -2963,7 +2963,7 @@ Status ProcessGDBRemote::DoGetMemoryRegionInfo(addr_t load_addr, return error; } -#if defined(__AIX__) +#if defined(_AIX) Status ProcessGDBRemote::DoGetLDXINFO(struct ld_xinfo *info_ptr) { Status error(m_gdb_comm.GetLDXINFO(info_ptr)); return error; diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index 82200fbea21cd..2bf3a04d213d4 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -37,7 +37,7 @@ #include "GDBRemoteCommunicationClient.h" #include "GDBRemoteRegisterContext.h" -#if defined(__AIX__) +#if defined(_AIX) struct ld_xinfo; #endif @@ -427,7 +427,7 @@ class ProcessGDBRemote : public Process, Status DoGetMemoryRegionInfo(lldb::addr_t load_addr, MemoryRegionInfo ®ion_info) override; -#if defined(__AIX__) +#if defined(_AIX) Status DoGetLDXINFO(struct ld_xinfo *info_ptr) override; #endif diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index a9aef7ef21855..e6ae7fc559ef4 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -75,7 +75,7 @@ #include "lldb/Utility/State.h" #include "lldb/Utility/Timer.h" -#if defined(__AIX__) +#if defined(_AIX) #include 
#endif @@ -6188,7 +6188,7 @@ Status Process::GetMemoryRegionInfo(lldb::addr_t load_addr, return DoGetMemoryRegionInfo(load_addr, range_info); } -#if defined(__AIX__) +#if defined(_AIX) Status Process::GetLDXINFO(struct ld_xinfo *info_ptr) { return DoGetLDXINFO(info_ptr); } diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp index fbdbc8c63a5d0..fdf269a3d3653 100644 --- a/lldb/source/Target/RegisterContextUnwind.cpp +++ b/lldb/source/Target/RegisterContextUnwind.cpp @@ -40,7 +40,7 @@ #include #include -#ifdef __AIX__ +#ifdef _AIX #include "Plugins/Process/Utility/lldb-ppc64le-register-enums.h" #endif @@ -1260,7 +1260,7 @@ bool RegisterContextUnwind::IsTrapHandlerSymbol( // Answer the question: Where did THIS frame save the CALLER frame ("previous" // frame)'s register value? -#ifdef __AIX__ +#ifdef _AIX extern bool UGLY_HACK_NULL_TOPFRAME; #endif @@ -1525,7 +1525,7 @@ RegisterContextUnwind::SavedLocationForRegister( new_regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterInLiveRegisterContext; new_regloc.location.register_number = regnum.GetAsKind(eRegisterKindLLDB); -#ifdef __AIX__ +#ifdef _AIX if (UGLY_HACK_NULL_TOPFRAME && new_regloc.location.register_number == 0x20) { new_regloc.location.register_number = 0x24; } @@ -2390,7 +2390,7 @@ bool RegisterContextUnwind::ReadPC(addr_t &pc) { } } -#ifdef __AIX__ +#ifdef _AIX bool RegisterContextUnwind::ReadLR(addr_t &lr) { if (!IsValid()) return false; diff --git a/lldb/source/Target/UnwindLLDB.cpp b/lldb/source/Target/UnwindLLDB.cpp index 8edf359cac497..764bea5bf86c6 100644 --- a/lldb/source/Target/UnwindLLDB.cpp +++ b/lldb/source/Target/UnwindLLDB.cpp @@ -68,7 +68,7 @@ uint32_t UnwindLLDB::DoGetFrameCount() { return m_frames.size(); } -#ifdef __AIX__ +#ifdef _AIX bool UGLY_HACK_NULL_TOPFRAME = false; #endif @@ -95,7 +95,7 @@ bool UnwindLLDB::AddFirstFrame() { if (!reg_ctx_sp->ReadPC(first_cursor_sp->start_pc)) goto unwind_done; -#ifdef __AIX__ +#ifdef _AIX 
lldb::addr_t lr; if (!reg_ctx_sp->ReadLR(lr)) goto unwind_done; diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index 45837503e8b73..d17ed77485d31 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -640,7 +640,7 @@ void sigwinch_handler(int signo) { } void sigint_handler(int signo) { -#if defined(_WIN32) || defined(__AIX__) // Restore handler as it is not persistent on Windows +#if defined(_WIN32) || defined(_AIX) // Restore handler as it is not persistent on Windows signal(SIGINT, sigint_handler); #endif static std::atomic_flag g_interrupt_sent = ATOMIC_FLAG_INIT; @@ -729,7 +729,7 @@ static void printHelp(LLDBOptTable &table, llvm::StringRef tool_name) { int main(int argc, char const *argv[]) { // Editline uses for example iswprint which is dependent on LC_CTYPE. // FIXME: this caused unexpected SIGTRAP on AIX -#ifndef __AIX__ +#ifndef _AIX std::setlocale(LC_ALL, ""); std::setlocale(LC_CTYPE, ""); #endif diff --git a/lldb/tools/lldb-server/SystemInitializerLLGS.cpp b/lldb/tools/lldb-server/SystemInitializerLLGS.cpp index 91bb2083a88b5..52c2eae0c9033 100644 --- a/lldb/tools/lldb-server/SystemInitializerLLGS.cpp +++ b/lldb/tools/lldb-server/SystemInitializerLLGS.cpp @@ -14,7 +14,7 @@ using HostObjectFile = ObjectFileMachO; #elif defined(_WIN32) #include "Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h" using HostObjectFile = ObjectFilePECOFF; -#elif defined(__AIX__) +#elif defined(_AIX) #include "Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h" using HostObjectFile = ObjectFileXCOFF; #else @@ -49,7 +49,7 @@ using HostObjectFile = ObjectFileELF; #include "Plugins/Instruction/MIPS/EmulateInstructionMIPS.h" #endif -#if defined(__AIX__) +#if defined(_AIX) #include "Plugins/Instruction/PPC64/EmulateInstructionPPC64.h" #endif @@ -82,7 +82,7 @@ llvm::Error SystemInitializerLLGS::Initialize() { EmulateInstructionRISCV::Initialize(); #endif -#if defined(__AIX__) +#if defined(_AIX) EmulateInstructionPPC64::Initialize(); #endif @@ 
-108,7 +108,7 @@ void SystemInitializerLLGS::Terminate() { EmulateInstructionRISCV::Terminate(); #endif -#if defined(__AIX__) +#if defined(_AIX) EmulateInstructionPPC64::Terminate(); #endif diff --git a/lldb/tools/lldb-server/lldb-gdbserver.cpp b/lldb/tools/lldb-server/lldb-gdbserver.cpp index 844a6370bdb2e..dcbb421a73e25 100644 --- a/lldb/tools/lldb-server/lldb-gdbserver.cpp +++ b/lldb/tools/lldb-server/lldb-gdbserver.cpp @@ -45,7 +45,7 @@ #include "Plugins/Process/NetBSD/NativeProcessNetBSD.h" #elif defined(_WIN32) #include "Plugins/Process/Windows/Common/NativeProcessWindows.h" -#elif defined(__AIX__) +#elif defined(_AIX) #include "Plugins/Process/AIX/NativeProcessAIX.h" #endif @@ -72,7 +72,7 @@ typedef process_freebsd::NativeProcessFreeBSD::Manager NativeProcessManager; typedef process_netbsd::NativeProcessNetBSD::Manager NativeProcessManager; #elif defined(_WIN32) typedef NativeProcessWindows::Manager NativeProcessManager; -#elif defined(__AIX__) +#elif defined(_AIX) typedef process_aix::NativeProcessAIX::Manager NativeProcessManager; #else // Dummy implementation to make sure the code compiles diff --git a/lldb/unittests/Host/MainLoopTest.cpp b/lldb/unittests/Host/MainLoopTest.cpp index c76476c947054..5c042261b9ef2 100644 --- a/lldb/unittests/Host/MainLoopTest.cpp +++ b/lldb/unittests/Host/MainLoopTest.cpp @@ -223,7 +223,7 @@ TEST_F(MainLoopTest, PendingCallbackAfterLoopExited) { loop.AddPendingCallback([&](MainLoopBase &loop) {}); } -#if defined(LLVM_ON_UNIX) && !defined(__AIX__) +#if defined(LLVM_ON_UNIX) && !defined(_AIX) TEST_F(MainLoopTest, DetectsEOF) { PseudoTerminal term; diff --git a/lldb/unittests/Host/PipeTest.cpp b/lldb/unittests/Host/PipeTest.cpp index c1013aa7a7e4e..00ffd33d68f7a 100644 --- a/lldb/unittests/Host/PipeTest.cpp +++ b/lldb/unittests/Host/PipeTest.cpp @@ -55,7 +55,7 @@ TEST_F(PipeTest, OpenAsReader) { } #endif -#if !defined(__AIX__) +#if !defined(_AIX) TEST_F(PipeTest, WriteWithTimeout) { Pipe pipe; 
ASSERT_THAT_ERROR(pipe.CreateNew(false).ToError(), llvm::Succeeded()); diff --git a/lldb/unittests/Host/posix/TerminalTest.cpp b/lldb/unittests/Host/posix/TerminalTest.cpp index f3de92c0852b1..64e6be64db80c 100644 --- a/lldb/unittests/Host/posix/TerminalTest.cpp +++ b/lldb/unittests/Host/posix/TerminalTest.cpp @@ -94,14 +94,14 @@ TEST_F(TerminalTest, SetRaw) { TEST_F(TerminalTest, SetBaudRate) { struct termios terminfo; -#if (defined(__AIX__) && defined(B38400)) || !defined(__AIX__) +#if (defined(_AIX) && defined(B38400)) || !defined(_AIX) ASSERT_THAT_ERROR(m_term.SetBaudRate(38400), llvm::Succeeded()); ASSERT_EQ(tcgetattr(m_fd, &terminfo), 0); EXPECT_EQ(cfgetispeed(&terminfo), static_cast(B38400)); EXPECT_EQ(cfgetospeed(&terminfo), static_cast(B38400)); #endif -#if (defined(__AIX__) && defined(B115200)) || !defined(__AIX__) +#if (defined(_AIX) && defined(B115200)) || !defined(_AIX) ASSERT_THAT_ERROR(m_term.SetBaudRate(115200), llvm::Succeeded()); ASSERT_EQ(tcgetattr(m_fd, &terminfo), 0); EXPECT_EQ(cfgetispeed(&terminfo), static_cast(B115200)); From a8020a6a8692f059679195ae1a0ef5e0eeee94c8 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Thu, 10 Oct 2024 04:52:08 -0500 Subject: [PATCH 11/58] Removed from lldb/CMakeLists --- lldb/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index a4fd8bccf056d..59cdc4593463c 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -38,10 +38,6 @@ endif() include(LLDBConfig) include(AddLLDB) -if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") - add_definitions("-D_AIX") -endif() - # Define the LLDB_CONFIGURATION_xxx matching the build type. 
if(uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) add_definitions(-DLLDB_CONFIGURATION_DEBUG) From 7609ad339bfab48412221be54edc2d2d146279c3 Mon Sep 17 00:00:00 2001 From: Lakshmi-Surekha Date: Thu, 14 Nov 2024 13:23:59 -0600 Subject: [PATCH 12/58] Patch for the Merge conflict of xcoff first merge with llvm --- .../ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 253 +++++++++++++++++- 1 file changed, 252 insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index 395a126a01fce..a4d9ea295b4c3 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -188,7 +188,258 @@ bool ObjectFileXCOFF::ParseHeader() { if (module_sp) { std::lock_guard guard(module_sp->GetMutex()); m_sect_headers.clear(); - lldb::offs + lldb::offset_t offset = 0; + + if (ParseXCOFFHeader(m_data, &offset, m_xcoff_header)) { + m_data.SetAddressByteSize(GetAddressByteSize()); + if (m_xcoff_header.auxhdrsize > 0) + ParseXCOFFOptionalHeader(m_data, &offset); + ParseSectionHeaders(offset); + } + return true; + } + + return false; +} + +bool ObjectFileXCOFF::ParseXCOFFHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr, + xcoff_header_t &xcoff_header) { + //FIXME: data.ValidOffsetForDataOfSize + xcoff_header.magic = data.GetU16(offset_ptr); + xcoff_header.nsects = data.GetU16(offset_ptr); + xcoff_header.modtime = data.GetU32(offset_ptr); + xcoff_header.symoff = data.GetU64(offset_ptr); + xcoff_header.auxhdrsize = data.GetU16(offset_ptr); + xcoff_header.flags = data.GetU16(offset_ptr); + xcoff_header.nsyms = data.GetU32(offset_ptr); + return true; +} + +bool ObjectFileXCOFF::ParseXCOFFOptionalHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr) { + lldb::offset_t init_offset = *offset_ptr; + //FIXME: data.ValidOffsetForDataOfSize + m_xcoff_aux_header.AuxMagic = data.GetU16(offset_ptr); + 
m_xcoff_aux_header.Version = data.GetU16(offset_ptr); + m_xcoff_aux_header.ReservedForDebugger = data.GetU32(offset_ptr); + m_xcoff_aux_header.TextStartAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.DataStartAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.TOCAnchorAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.SecNumOfEntryPoint = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfText = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfData = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfTOC = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfLoader = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfBSS = data.GetU16(offset_ptr); + m_xcoff_aux_header.MaxAlignOfText = data.GetU16(offset_ptr); + m_xcoff_aux_header.MaxAlignOfData = data.GetU16(offset_ptr); + m_xcoff_aux_header.ModuleType = data.GetU16(offset_ptr); + m_xcoff_aux_header.CpuFlag = data.GetU8(offset_ptr); + m_xcoff_aux_header.CpuType = data.GetU8(offset_ptr); + m_xcoff_aux_header.TextPageSize = data.GetU8(offset_ptr); + m_xcoff_aux_header.DataPageSize = data.GetU8(offset_ptr); + m_xcoff_aux_header.StackPageSize = data.GetU8(offset_ptr); + m_xcoff_aux_header.FlagAndTDataAlignment = data.GetU8(offset_ptr); + m_xcoff_aux_header.TextSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.InitDataSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.BssDataSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.EntryPointAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.MaxStackSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.MaxDataSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.SecNumOfTData = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfTBSS = data.GetU16(offset_ptr); + m_xcoff_aux_header.XCOFF64Flag = data.GetU16(offset_ptr); + lldb::offset_t last_offset = *offset_ptr; + if ((last_offset - init_offset) < m_xcoff_header.auxhdrsize) + *offset_ptr += (m_xcoff_header.auxhdrsize - (last_offset - init_offset)); + return true; +} + +bool 
ObjectFileXCOFF::ParseSectionHeaders( + uint32_t section_header_data_offset) { + const uint32_t nsects = m_xcoff_header.nsects; + m_sect_headers.clear(); + + if (nsects > 0) { + const size_t section_header_byte_size = nsects * m_binary->getSectionHeaderSize(); + lldb_private::DataExtractor section_header_data = + ReadImageData(section_header_data_offset, section_header_byte_size); + + lldb::offset_t offset = 0; + //FIXME: section_header_data.ValidOffsetForDataOfSize + m_sect_headers.resize(nsects); + + for (uint32_t idx = 0; idx < nsects; ++idx) { + const void *name_data = section_header_data.GetData(&offset, 8); + if (name_data) { + memcpy(m_sect_headers[idx].name, name_data, 8); + m_sect_headers[idx].phyaddr = section_header_data.GetU64(&offset); + m_sect_headers[idx].vmaddr = section_header_data.GetU64(&offset); + m_sect_headers[idx].size = section_header_data.GetU64(&offset); + m_sect_headers[idx].offset = section_header_data.GetU64(&offset); + m_sect_headers[idx].reloff = section_header_data.GetU64(&offset); + m_sect_headers[idx].lineoff = section_header_data.GetU64(&offset); + m_sect_headers[idx].nreloc = section_header_data.GetU32(&offset); + m_sect_headers[idx].nline = section_header_data.GetU32(&offset); + m_sect_headers[idx].flags = section_header_data.GetU32(&offset); + offset += 4; + } else { + offset += (m_binary->getSectionHeaderSize() - 8); + } + } + } + + return !m_sect_headers.empty(); +} + +lldb_private::DataExtractor ObjectFileXCOFF::ReadImageData(uint32_t offset, size_t size) { + if (!size) + return {}; + + if (m_data.ValidOffsetForDataOfSize(offset, size)) + return lldb_private::DataExtractor(m_data, offset, size); + + assert(0); + ProcessSP process_sp(m_process_wp.lock()); + lldb_private::DataExtractor data; + if (process_sp) { + auto data_up = std::make_unique(size, 0); + Status readmem_error; + size_t bytes_read = + process_sp->ReadMemory(offset, data_up->GetBytes(), + data_up->GetByteSize(), readmem_error); + if (bytes_read == size) { + 
DataBufferSP buffer_sp(data_up.release()); + data.SetData(buffer_sp, 0, buffer_sp->GetByteSize()); + } + } + return data; +} + +bool ObjectFileXCOFF::SetLoadAddress(Target &target, lldb::addr_t value, + bool value_is_offset) { + bool changed = false; + ModuleSP module_sp = GetModule(); + if (module_sp) { + size_t num_loaded_sections = 0; + SectionList *section_list = GetSectionList(); + if (section_list) { + const size_t num_sections = section_list->GetSize(); + size_t sect_idx = 0; + + for (sect_idx = 0; sect_idx < num_sections; ++sect_idx) { + // Iterate through the object file sections to find all of the sections + // that have SHF_ALLOC in their flag bits. + SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); + if (section_sp && !section_sp->IsThreadSpecific()) { + bool use_offset = false; + if (strcmp(section_sp->GetName().AsCString(), ".text") == 0 || + strcmp(section_sp->GetName().AsCString(), ".data") == 0 || + strcmp(section_sp->GetName().AsCString(), ".bss") == 0) + use_offset = true; + + if (target.GetSectionLoadList().SetSectionLoadAddress( + section_sp, (use_offset ? + (section_sp->GetFileOffset() + value) : (section_sp->GetFileAddress() + value)))) + ++num_loaded_sections; + } + } + changed = num_loaded_sections > 0; + } + } + return changed; +} + +bool ObjectFileXCOFF::SetLoadAddressByType(Target &target, lldb::addr_t value, + bool value_is_offset, int type_id) { + bool changed = false; + ModuleSP module_sp = GetModule(); + if (module_sp) { + size_t num_loaded_sections = 0; + SectionList *section_list = GetSectionList(); + if (section_list) { + const size_t num_sections = section_list->GetSize(); + size_t sect_idx = 0; + + for (sect_idx = 0; sect_idx < num_sections; ++sect_idx) { + // Iterate through the object file sections to find all of the sections + // that have SHF_ALLOC in their flag bits. 
+ SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); + if (type_id == 1 && section_sp && strcmp(section_sp->GetName().AsCString(), ".text") == 0) { + if (!section_sp->IsThreadSpecific()) { + if (target.GetSectionLoadList().SetSectionLoadAddress( + section_sp, section_sp->GetFileOffset() + value)) + ++num_loaded_sections; + } + } else if (type_id == 2 && section_sp && strcmp(section_sp->GetName().AsCString(), ".data") == 0) { + if (!section_sp->IsThreadSpecific()) { + if (target.GetSectionLoadList().SetSectionLoadAddress( + section_sp, section_sp->GetFileAddress() + value)) + ++num_loaded_sections; + } + } + } + changed = num_loaded_sections > 0; + } + } + return changed; +} + +ByteOrder ObjectFileXCOFF::GetByteOrder() const { + return eByteOrderBig; +} + +bool ObjectFileXCOFF::IsExecutable() const { + return true; +} + +uint32_t ObjectFileXCOFF::GetAddressByteSize() const { + if (m_xcoff_header.magic == XCOFF::XCOFF64) + return 8; + else if (m_xcoff_header.magic == XCOFF::XCOFF32) + return 4; + return 4; +} + +AddressClass ObjectFileXCOFF::GetAddressClass(addr_t file_addr) { + return AddressClass::eUnknown; +} + +lldb::SymbolType ObjectFileXCOFF::MapSymbolType(llvm::object::SymbolRef::Type sym_type) { + if (sym_type == llvm::object::SymbolRef::ST_Function) + return lldb::eSymbolTypeCode; + else if (sym_type == llvm::object::SymbolRef::ST_Data) + return lldb::eSymbolTypeData; + return lldb::eSymbolTypeInvalid; +} + +void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) { + SectionList *sect_list = GetSectionList(); + const uint32_t num_syms = m_xcoff_header.nsyms; + uint32_t sidx = 0; + if (num_syms > 0 && m_xcoff_header.symoff > 0) { + const uint32_t symbol_size = XCOFF::SymbolTableEntrySize; + const size_t symbol_data_size = num_syms * symbol_size; + lldb_private::DataExtractor symtab_data = + ReadImageData(m_xcoff_header.symoff, symbol_data_size); + + lldb::offset_t offset = 0; + std::string symbol_name; + Symbol *symbols = 
lldb_symtab.Resize(num_syms); + llvm::object::symbol_iterator SI = m_binary->symbol_begin(); + for (uint32_t i = 0; i < num_syms; ++i, ++SI) { + xcoff_symbol_t symbol; + const uint32_t symbol_offset = offset; + symbol.value = symtab_data.GetU64(&offset); + symbol.offset = symtab_data.GetU32(&offset); + Expected symbol_name_or_err = m_binary->getStringTableEntry(symbol.offset); + if (!symbol_name_or_err) { + consumeError(symbol_name_or_err.takeError()); + return; + } + StringRef symbol_name_str = symbol_name_or_err.get(); + symbol_name.assign(symbol_name_str.data()); symbol.sect = symtab_data.GetU16(&offset); symbol.type = symtab_data.GetU16(&offset); symbol.storage = symtab_data.GetU8(&offset); From dd56fce276b60b40e1997292b3f554a20157661a Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Sun, 17 Nov 2024 00:15:01 -0600 Subject: [PATCH 13/58] Attach fix for AIX --- .../AIX-DYLD/DynamicLoaderAIXDYLD.cpp | 130 ++++++++++++++++++ .../AIX-DYLD/DynamicLoaderAIXDYLD.h | 5 +- 2 files changed, 134 insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp index 7f3a638d5b028..acaa6a72edded 100644 --- a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp @@ -18,8 +18,13 @@ #include "lldb/Target/ThreadPlanStepInstruction.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" +#include "llvm/Support/FileSystem.h" #if defined(_AIX) #include +#include +#include +#include +#include #endif /*#include "llvm/ADT/Triple.h" @@ -131,14 +136,139 @@ bool DynamicLoaderAIXDYLD::NotifyBreakpointHit( lldb::user_id_t break_loc_id) { } + +void DynamicLoaderAIXDYLD::ResolveExecutableModule( + lldb::ModuleSP &module_sp) { + Log *log = GetLog(LLDBLog::DynamicLoader); + + if (m_process == nullptr) + return; + + auto &target = m_process->GetTarget(); + const auto platform_sp 
= target.GetPlatform(); + + ProcessInstanceInfo process_info; + if (!m_process->GetProcessInfo(process_info)) { + LLDB_LOGF(log, + "DynamicLoaderPOSIXDYLD::%s - failed to get process info for " + "pid %" PRIu64, + __FUNCTION__, m_process->GetID()); + return; + } + + char procinfo_path[64], exe_path[PATH_MAX], arg_buffer[8192]; + struct procsinfo64 procs_info; + int32long64_t pid = m_process->GetID(); + std::string proc_file = "/proc/" + std::to_string(pid) + "/psinfo"; + std::string cwd_link = "/proc/" + std::to_string(pid) + "/cwd"; + psinfo_t psinfo; + std::ifstream file(proc_file, std::ios::binary); + if(!file.is_open()) + { + LLDB_LOGF(log, "Error psinfo "); + } + file.read(reinterpret_cast(&psinfo), sizeof(psinfo_t)); + if(!file) + LLDB_LOGF(log, "Error psinfo: Failed to read "); + + std::string relative_path(psinfo.pr_fname); + LLDB_LOGF(log, "relative path %s",relative_path.c_str()); + + char cwd[PATH_MAX]; + char resolved_path[PATH_MAX]; + std::string executable_name; + bool found = 0; + if(readlink(cwd_link.c_str(), cwd, sizeof(cwd)) != -1){ + std::filesystem::path full_path = std::filesystem::path(cwd)/relative_path; + if(realpath(full_path.c_str(), resolved_path)) { + LLDB_LOGF(log, " RESOLVED PATH: %s", resolved_path); + found = 1; + } + else + perror("realpath error");} + + executable_name = resolved_path; + if(found == 0) { + std::string command_line(psinfo.pr_psargs); + LLDB_LOGF(log, "command line %s",command_line.c_str()); + if (!command_line.empty()) { + size_t space1 = command_line.find(' '); + executable_name = command_line.substr(0, space1); + LLDB_LOGF(log, "executable name %s",executable_name.c_str()); + } + } + + LLDB_LOGF(log, "executable name %s",executable_name.c_str()); + /*target.SetExecutableModule(target.GetOrCreateModule(lldb_private::FileSpec(resolved_path), + true),true);*/ + process_info.SetExecutableFile(lldb_private::FileSpec(executable_name), + true); + +/* snprintf(procinfo_path, sizeof(procinfo_path), "/proc/%d/object/a.out", 
pid); + ssize_t len = readlink(procinfo_path, exe_path, sizeof(exe_path) - 1); + exe_path[len] = '\0'; + int num_procs = getprocs64(&procs_info, sizeof(struct procsinfo64), NULL, 0, + &pid, + 1); + int result = getargs(pid, arg_buffer, sizeof(arg_buffer)); + std::vector args; + char *arg_start = arg_buffer; + while(*arg_start != '\0') { + args.emplace_back(arg_start); + arg_start += strlen(arg_start) + 1; + } + + LLDB_LOGF( + log, "1. DynamicLoaderPOSIXDYLD::%s - got executable by pid %" PRIu64 ": %s" + ", pid: %d, current_path: %s", + __FUNCTION__, m_process->GetID(), + args[0], pid, current_path.c_str()); + LLDB_LOGF( + log, "1. DynamicLoaderPOSIXDYLD::%s - got executable by pid %" PRIu64 ": %s" + "num_procs: %d, pid: %d", + __FUNCTION__, m_process->GetID(), + std::string(procs_info.pi_comm).c_str(), num_procs, pid); + if(num_procs <= 0) + perror("getprocs64 failed"); */ + + LLDB_LOGF( + log, "DynamicLoaderPOSIXDYLD::%s - got executable by pid %" PRIu64 ": %s", + __FUNCTION__, m_process->GetID(), + process_info.GetExecutableFile().GetPath().c_str()); + + ModuleSpec module_spec(process_info.GetExecutableFile(), + process_info.GetArchitecture()); + if (module_sp && module_sp->MatchesModuleSpec(module_spec)) + return; + + const auto executable_search_paths(Target::GetDefaultExecutableSearchPaths()); + auto error = platform_sp->ResolveExecutable( + module_spec, module_sp, + !executable_search_paths.IsEmpty() ? 
&executable_search_paths : nullptr); + if (error.Fail()) { + StreamString stream; + module_spec.Dump(stream); + + LLDB_LOGF(log, + "DynamicLoaderPOSIXDYLD::%s - failed to resolve executable " + "with module spec \"%s\": %s", + __FUNCTION__, stream.GetData(), error.AsCString()); + return; + } + + target.SetExecutableModule(module_sp, eLoadDependentsNo); +} + void DynamicLoaderAIXDYLD::DidAttach() { Log *log = GetLog(LLDBLog::DynamicLoader); LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); ModuleSP executable = GetTargetExecutable(); + ResolveExecutableModule(executable); if (!executable.get()) return; + LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); // Try to fetch the load address of the file from the process, since there // could be randomization of the load address. diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h index ae4b7aca66dcc..0ffbe688e0069 100644 --- a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h @@ -24,7 +24,7 @@ class DynamicLoaderAIXDYLD : public DynamicLoader { static void Initialize(); static void Terminate(); - static llvm::StringRef GetPluginNameStatic() { return "windows-dyld"; } + static llvm::StringRef GetPluginNameStatic() { return "aix-dyld"; } static llvm::StringRef GetPluginDescriptionStatic(); static DynamicLoader *CreateInstance(Process *process, bool force); @@ -46,6 +46,9 @@ class DynamicLoaderAIXDYLD : public DynamicLoader { protected: lldb::addr_t GetLoadAddress(lldb::ModuleSP executable); + /// Loads Module from inferior process. 
+ void ResolveExecutableModule(lldb::ModuleSP &module_sp); + private: std::map m_loaded_modules; }; From 48b8b1b6532181acab0ee1710d5f4ab92903ae78 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Mon, 18 Nov 2024 02:56:31 -0600 Subject: [PATCH 14/58] Cleanup --- .../AIX-DYLD/DynamicLoaderAIXDYLD.cpp | 65 +++++-------------- 1 file changed, 17 insertions(+), 48 deletions(-) diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp index acaa6a72edded..1a98bb9334043 100644 --- a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp @@ -156,81 +156,50 @@ void DynamicLoaderAIXDYLD::ResolveExecutableModule( return; } - char procinfo_path[64], exe_path[PATH_MAX], arg_buffer[8192]; - struct procsinfo64 procs_info; int32long64_t pid = m_process->GetID(); + char cwd[PATH_MAX], resolved_path[PATH_MAX]; + std::string executable_name; + bool path_resolved = false; + psinfo_t psinfo; + std::string proc_file = "/proc/" + std::to_string(pid) + "/psinfo"; std::string cwd_link = "/proc/" + std::to_string(pid) + "/cwd"; - psinfo_t psinfo; std::ifstream file(proc_file, std::ios::binary); if(!file.is_open()) - { - LLDB_LOGF(log, "Error psinfo "); - } + LLDB_LOGF(log, "Error: Unable to access process info "); + file.read(reinterpret_cast(&psinfo), sizeof(psinfo_t)); if(!file) - LLDB_LOGF(log, "Error psinfo: Failed to read "); + LLDB_LOGF(log, "Process info error: Failed to read "); std::string relative_path(psinfo.pr_fname); - LLDB_LOGF(log, "relative path %s",relative_path.c_str()); + LLDB_LOGF(log, "Relative path %s",relative_path.c_str()); - char cwd[PATH_MAX]; - char resolved_path[PATH_MAX]; - std::string executable_name; - bool found = 0; if(readlink(cwd_link.c_str(), cwd, sizeof(cwd)) != -1){ std::filesystem::path full_path = std::filesystem::path(cwd)/relative_path; if(realpath(full_path.c_str(), 
resolved_path)) { - LLDB_LOGF(log, " RESOLVED PATH: %s", resolved_path); - found = 1; + LLDB_LOGF(log, "Resolved Path using process info : %s", resolved_path); + path_resolved = true; } else - perror("realpath error");} + LLDB_LOGF(log, "Realpath error: Unable to resolve. "); + } executable_name = resolved_path; - if(found == 0) { + if(path_resolved == false) { std::string command_line(psinfo.pr_psargs); - LLDB_LOGF(log, "command line %s",command_line.c_str()); + LLDB_LOGF(log, "Command line: %s",command_line.c_str()); if (!command_line.empty()) { size_t space1 = command_line.find(' '); executable_name = command_line.substr(0, space1); - LLDB_LOGF(log, "executable name %s",executable_name.c_str()); + LLDB_LOGF(log, "Resolved path using command line arg %s",executable_name.c_str()); } } - LLDB_LOGF(log, "executable name %s",executable_name.c_str()); - /*target.SetExecutableModule(target.GetOrCreateModule(lldb_private::FileSpec(resolved_path), - true),true);*/ + LLDB_LOGF(log, "Executable Name %s",executable_name.c_str()); process_info.SetExecutableFile(lldb_private::FileSpec(executable_name), true); -/* snprintf(procinfo_path, sizeof(procinfo_path), "/proc/%d/object/a.out", pid); - ssize_t len = readlink(procinfo_path, exe_path, sizeof(exe_path) - 1); - exe_path[len] = '\0'; - int num_procs = getprocs64(&procs_info, sizeof(struct procsinfo64), NULL, 0, - &pid, - 1); - int result = getargs(pid, arg_buffer, sizeof(arg_buffer)); - std::vector args; - char *arg_start = arg_buffer; - while(*arg_start != '\0') { - args.emplace_back(arg_start); - arg_start += strlen(arg_start) + 1; - } - - LLDB_LOGF( - log, "1. DynamicLoaderPOSIXDYLD::%s - got executable by pid %" PRIu64 ": %s" - ", pid: %d, current_path: %s", - __FUNCTION__, m_process->GetID(), - args[0], pid, current_path.c_str()); - LLDB_LOGF( - log, "1. 
DynamicLoaderPOSIXDYLD::%s - got executable by pid %" PRIu64 ": %s" - "num_procs: %d, pid: %d", - __FUNCTION__, m_process->GetID(), - std::string(procs_info.pi_comm).c_str(), num_procs, pid); - if(num_procs <= 0) - perror("getprocs64 failed"); */ - LLDB_LOGF( log, "DynamicLoaderPOSIXDYLD::%s - got executable by pid %" PRIu64 ": %s", __FUNCTION__, m_process->GetID(), From d410734184a681b3e95949d3953142995682d7f6 Mon Sep 17 00:00:00 2001 From: Lakshmi-Surekha Date: Tue, 19 Nov 2024 09:44:42 -0600 Subject: [PATCH 15/58] Patch in MainLoopPosix.cpp for runtime issue --- lldb/source/Host/posix/MainLoopPosix.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp index f68268f114075..4c617cdde67ba 100644 --- a/lldb/source/Host/posix/MainLoopPosix.cpp +++ b/lldb/source/Host/posix/MainLoopPosix.cpp @@ -149,7 +149,7 @@ Status MainLoopPosix::RunImpl::Poll() { int timeout; timeout = -1; - pthread_sigmask(SIG_SETMASK, &sigmask, &origmask); + pthread_sigmask(SIG_SETMASK, nullptr, &origmask); int ready = poll(read_fds.data(), read_fds.size(), timeout); pthread_sigmask(SIG_SETMASK, &origmask, nullptr); if (ready == -1 && errno != EINTR) From 48f39dadbbdb4874fbd9b6350933dc67e8823339 Mon Sep 17 00:00:00 2001 From: Lakshmi-Surekha Date: Thu, 5 Dec 2024 05:13:14 -0600 Subject: [PATCH 16/58] Patch for compilation failure in DomainSocket.cpp, AbstractSocket.cpp and AbstractSocket.h --- lldb/include/lldb/Host/aix/AbstractSocket.h | 2 +- lldb/source/Host/aix/AbstractSocket.cpp | 3 +-- lldb/source/Host/posix/DomainSocket.cpp | 4 ++++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/lldb/include/lldb/Host/aix/AbstractSocket.h b/lldb/include/lldb/Host/aix/AbstractSocket.h index 78a567a6b9095..accfd01457a5e 100644 --- a/lldb/include/lldb/Host/aix/AbstractSocket.h +++ b/lldb/include/lldb/Host/aix/AbstractSocket.h @@ -14,7 +14,7 @@ namespace lldb_private { class AbstractSocket : public 
DomainSocket { public: - AbstractSocket(bool child_processes_inherit); + AbstractSocket(); protected: size_t GetNameOffset() const override; diff --git a/lldb/source/Host/aix/AbstractSocket.cpp b/lldb/source/Host/aix/AbstractSocket.cpp index bfb67d452f7ec..fddf78f54f46d 100644 --- a/lldb/source/Host/aix/AbstractSocket.cpp +++ b/lldb/source/Host/aix/AbstractSocket.cpp @@ -13,8 +13,7 @@ using namespace lldb; using namespace lldb_private; -AbstractSocket::AbstractSocket(bool child_processes_inherit) - : DomainSocket(ProtocolUnixAbstract, child_processes_inherit) {} +AbstractSocket::AbstractSocket() : DomainSocket(ProtocolUnixAbstract) {} size_t AbstractSocket::GetNameOffset() const { return 1; } diff --git a/lldb/source/Host/posix/DomainSocket.cpp b/lldb/source/Host/posix/DomainSocket.cpp index 9a0b385d998bf..6cbffb2d9c4bd 100644 --- a/lldb/source/Host/posix/DomainSocket.cpp +++ b/lldb/source/Host/posix/DomainSocket.cpp @@ -17,6 +17,10 @@ #include #include +#if defined(_AIX) +#include +#endif + using namespace lldb; using namespace lldb_private; From 97531f7bf6e385f0f51d860c6eea17aeb32f6594 Mon Sep 17 00:00:00 2001 From: Lakshmi-Surekha Date: Thu, 19 Dec 2024 06:38:36 -0600 Subject: [PATCH 17/58] Patch for merge conflict in ObjectFileXCOFF.cpp & ObjectFileXCOFF.h --- .../ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 253 +++++++++++++++++- 1 file changed, 252 insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index 395a126a01fce..a4d9ea295b4c3 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -188,7 +188,258 @@ bool ObjectFileXCOFF::ParseHeader() { if (module_sp) { std::lock_guard guard(module_sp->GetMutex()); m_sect_headers.clear(); - lldb::offs + lldb::offset_t offset = 0; + + if (ParseXCOFFHeader(m_data, &offset, m_xcoff_header)) { + m_data.SetAddressByteSize(GetAddressByteSize()); + 
if (m_xcoff_header.auxhdrsize > 0) + ParseXCOFFOptionalHeader(m_data, &offset); + ParseSectionHeaders(offset); + } + return true; + } + + return false; +} + +bool ObjectFileXCOFF::ParseXCOFFHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr, + xcoff_header_t &xcoff_header) { + //FIXME: data.ValidOffsetForDataOfSize + xcoff_header.magic = data.GetU16(offset_ptr); + xcoff_header.nsects = data.GetU16(offset_ptr); + xcoff_header.modtime = data.GetU32(offset_ptr); + xcoff_header.symoff = data.GetU64(offset_ptr); + xcoff_header.auxhdrsize = data.GetU16(offset_ptr); + xcoff_header.flags = data.GetU16(offset_ptr); + xcoff_header.nsyms = data.GetU32(offset_ptr); + return true; +} + +bool ObjectFileXCOFF::ParseXCOFFOptionalHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr) { + lldb::offset_t init_offset = *offset_ptr; + //FIXME: data.ValidOffsetForDataOfSize + m_xcoff_aux_header.AuxMagic = data.GetU16(offset_ptr); + m_xcoff_aux_header.Version = data.GetU16(offset_ptr); + m_xcoff_aux_header.ReservedForDebugger = data.GetU32(offset_ptr); + m_xcoff_aux_header.TextStartAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.DataStartAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.TOCAnchorAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.SecNumOfEntryPoint = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfText = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfData = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfTOC = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfLoader = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfBSS = data.GetU16(offset_ptr); + m_xcoff_aux_header.MaxAlignOfText = data.GetU16(offset_ptr); + m_xcoff_aux_header.MaxAlignOfData = data.GetU16(offset_ptr); + m_xcoff_aux_header.ModuleType = data.GetU16(offset_ptr); + m_xcoff_aux_header.CpuFlag = data.GetU8(offset_ptr); + m_xcoff_aux_header.CpuType = data.GetU8(offset_ptr); + m_xcoff_aux_header.TextPageSize = data.GetU8(offset_ptr); + 
m_xcoff_aux_header.DataPageSize = data.GetU8(offset_ptr); + m_xcoff_aux_header.StackPageSize = data.GetU8(offset_ptr); + m_xcoff_aux_header.FlagAndTDataAlignment = data.GetU8(offset_ptr); + m_xcoff_aux_header.TextSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.InitDataSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.BssDataSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.EntryPointAddr = data.GetU64(offset_ptr); + m_xcoff_aux_header.MaxStackSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.MaxDataSize = data.GetU64(offset_ptr); + m_xcoff_aux_header.SecNumOfTData = data.GetU16(offset_ptr); + m_xcoff_aux_header.SecNumOfTBSS = data.GetU16(offset_ptr); + m_xcoff_aux_header.XCOFF64Flag = data.GetU16(offset_ptr); + lldb::offset_t last_offset = *offset_ptr; + if ((last_offset - init_offset) < m_xcoff_header.auxhdrsize) + *offset_ptr += (m_xcoff_header.auxhdrsize - (last_offset - init_offset)); + return true; +} + +bool ObjectFileXCOFF::ParseSectionHeaders( + uint32_t section_header_data_offset) { + const uint32_t nsects = m_xcoff_header.nsects; + m_sect_headers.clear(); + + if (nsects > 0) { + const size_t section_header_byte_size = nsects * m_binary->getSectionHeaderSize(); + lldb_private::DataExtractor section_header_data = + ReadImageData(section_header_data_offset, section_header_byte_size); + + lldb::offset_t offset = 0; + //FIXME: section_header_data.ValidOffsetForDataOfSize + m_sect_headers.resize(nsects); + + for (uint32_t idx = 0; idx < nsects; ++idx) { + const void *name_data = section_header_data.GetData(&offset, 8); + if (name_data) { + memcpy(m_sect_headers[idx].name, name_data, 8); + m_sect_headers[idx].phyaddr = section_header_data.GetU64(&offset); + m_sect_headers[idx].vmaddr = section_header_data.GetU64(&offset); + m_sect_headers[idx].size = section_header_data.GetU64(&offset); + m_sect_headers[idx].offset = section_header_data.GetU64(&offset); + m_sect_headers[idx].reloff = section_header_data.GetU64(&offset); + 
m_sect_headers[idx].lineoff = section_header_data.GetU64(&offset); + m_sect_headers[idx].nreloc = section_header_data.GetU32(&offset); + m_sect_headers[idx].nline = section_header_data.GetU32(&offset); + m_sect_headers[idx].flags = section_header_data.GetU32(&offset); + offset += 4; + } else { + offset += (m_binary->getSectionHeaderSize() - 8); + } + } + } + + return !m_sect_headers.empty(); +} + +lldb_private::DataExtractor ObjectFileXCOFF::ReadImageData(uint32_t offset, size_t size) { + if (!size) + return {}; + + if (m_data.ValidOffsetForDataOfSize(offset, size)) + return lldb_private::DataExtractor(m_data, offset, size); + + assert(0); + ProcessSP process_sp(m_process_wp.lock()); + lldb_private::DataExtractor data; + if (process_sp) { + auto data_up = std::make_unique(size, 0); + Status readmem_error; + size_t bytes_read = + process_sp->ReadMemory(offset, data_up->GetBytes(), + data_up->GetByteSize(), readmem_error); + if (bytes_read == size) { + DataBufferSP buffer_sp(data_up.release()); + data.SetData(buffer_sp, 0, buffer_sp->GetByteSize()); + } + } + return data; +} + +bool ObjectFileXCOFF::SetLoadAddress(Target &target, lldb::addr_t value, + bool value_is_offset) { + bool changed = false; + ModuleSP module_sp = GetModule(); + if (module_sp) { + size_t num_loaded_sections = 0; + SectionList *section_list = GetSectionList(); + if (section_list) { + const size_t num_sections = section_list->GetSize(); + size_t sect_idx = 0; + + for (sect_idx = 0; sect_idx < num_sections; ++sect_idx) { + // Iterate through the object file sections to find all of the sections + // that have SHF_ALLOC in their flag bits. 
+ SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); + if (section_sp && !section_sp->IsThreadSpecific()) { + bool use_offset = false; + if (strcmp(section_sp->GetName().AsCString(), ".text") == 0 || + strcmp(section_sp->GetName().AsCString(), ".data") == 0 || + strcmp(section_sp->GetName().AsCString(), ".bss") == 0) + use_offset = true; + + if (target.GetSectionLoadList().SetSectionLoadAddress( + section_sp, (use_offset ? + (section_sp->GetFileOffset() + value) : (section_sp->GetFileAddress() + value)))) + ++num_loaded_sections; + } + } + changed = num_loaded_sections > 0; + } + } + return changed; +} + +bool ObjectFileXCOFF::SetLoadAddressByType(Target &target, lldb::addr_t value, + bool value_is_offset, int type_id) { + bool changed = false; + ModuleSP module_sp = GetModule(); + if (module_sp) { + size_t num_loaded_sections = 0; + SectionList *section_list = GetSectionList(); + if (section_list) { + const size_t num_sections = section_list->GetSize(); + size_t sect_idx = 0; + + for (sect_idx = 0; sect_idx < num_sections; ++sect_idx) { + // Iterate through the object file sections to find all of the sections + // that have SHF_ALLOC in their flag bits. 
+ SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); + if (type_id == 1 && section_sp && strcmp(section_sp->GetName().AsCString(), ".text") == 0) { + if (!section_sp->IsThreadSpecific()) { + if (target.GetSectionLoadList().SetSectionLoadAddress( + section_sp, section_sp->GetFileOffset() + value)) + ++num_loaded_sections; + } + } else if (type_id == 2 && section_sp && strcmp(section_sp->GetName().AsCString(), ".data") == 0) { + if (!section_sp->IsThreadSpecific()) { + if (target.GetSectionLoadList().SetSectionLoadAddress( + section_sp, section_sp->GetFileAddress() + value)) + ++num_loaded_sections; + } + } + } + changed = num_loaded_sections > 0; + } + } + return changed; +} + +ByteOrder ObjectFileXCOFF::GetByteOrder() const { + return eByteOrderBig; +} + +bool ObjectFileXCOFF::IsExecutable() const { + return true; +} + +uint32_t ObjectFileXCOFF::GetAddressByteSize() const { + if (m_xcoff_header.magic == XCOFF::XCOFF64) + return 8; + else if (m_xcoff_header.magic == XCOFF::XCOFF32) + return 4; + return 4; +} + +AddressClass ObjectFileXCOFF::GetAddressClass(addr_t file_addr) { + return AddressClass::eUnknown; +} + +lldb::SymbolType ObjectFileXCOFF::MapSymbolType(llvm::object::SymbolRef::Type sym_type) { + if (sym_type == llvm::object::SymbolRef::ST_Function) + return lldb::eSymbolTypeCode; + else if (sym_type == llvm::object::SymbolRef::ST_Data) + return lldb::eSymbolTypeData; + return lldb::eSymbolTypeInvalid; +} + +void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) { + SectionList *sect_list = GetSectionList(); + const uint32_t num_syms = m_xcoff_header.nsyms; + uint32_t sidx = 0; + if (num_syms > 0 && m_xcoff_header.symoff > 0) { + const uint32_t symbol_size = XCOFF::SymbolTableEntrySize; + const size_t symbol_data_size = num_syms * symbol_size; + lldb_private::DataExtractor symtab_data = + ReadImageData(m_xcoff_header.symoff, symbol_data_size); + + lldb::offset_t offset = 0; + std::string symbol_name; + Symbol *symbols = 
lldb_symtab.Resize(num_syms); + llvm::object::symbol_iterator SI = m_binary->symbol_begin(); + for (uint32_t i = 0; i < num_syms; ++i, ++SI) { + xcoff_symbol_t symbol; + const uint32_t symbol_offset = offset; + symbol.value = symtab_data.GetU64(&offset); + symbol.offset = symtab_data.GetU32(&offset); + Expected symbol_name_or_err = m_binary->getStringTableEntry(symbol.offset); + if (!symbol_name_or_err) { + consumeError(symbol_name_or_err.takeError()); + return; + } + StringRef symbol_name_str = symbol_name_or_err.get(); + symbol_name.assign(symbol_name_str.data()); symbol.sect = symtab_data.GetU16(&offset); symbol.type = symtab_data.GetU16(&offset); symbol.storage = symtab_data.GetU8(&offset); From 71d2fcff8975831e7f0a657481220749b0a473dc Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Tue, 24 Dec 2024 02:05:35 -0600 Subject: [PATCH 18/58] Added upcoming clang-format and other merge changes --- .../posix/ConnectionFileDescriptorPosix.cpp | 9 ++-- lldb/source/Host/posix/DomainSocket.cpp | 5 ++- lldb/source/Host/posix/FileSystemPosix.cpp | 2 +- lldb/source/Host/posix/MainLoopPosix.cpp | 43 ++++++++----------- .../Host/posix/ProcessLauncherPosixFork.cpp | 2 +- lldb/source/Plugins/Language/ObjC/Cocoa.cpp | 17 +++----- .../BSD-Archive/ObjectContainerBSDArchive.cpp | 29 +++++++------ lldb/source/Utility/ArchSpec.cpp | 1 - 8 files changed, 49 insertions(+), 59 deletions(-) diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp index 32d034e60d26c..e3d1300cf76ed 100644 --- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp +++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp @@ -119,8 +119,7 @@ bool ConnectionFileDescriptor::IsConnected() const { ConnectionStatus ConnectionFileDescriptor::Connect(llvm::StringRef path, Status *error_ptr) { - return Connect( - path, [](llvm::StringRef) {}, error_ptr); + return Connect(path, [](llvm::StringRef) {}, error_ptr); } ConnectionStatus 
@@ -716,8 +715,7 @@ ConnectionFileDescriptor::ConnectFD(llvm::StringRef s, ConnectionStatus ConnectionFileDescriptor::ConnectFile( llvm::StringRef s, socket_id_callback_type socket_id_callback, Status *error_ptr) { -#if !defined(_AIX) -#if LLDB_ENABLE_POSIX +#if LLDB_ENABLE_POSIX && !defined(_AIX) std::string addr_str = s.str(); // file:///PATH int fd = FileSystem::Instance().Open(addr_str.c_str(), O_RDWR); @@ -748,8 +746,7 @@ ConnectionStatus ConnectionFileDescriptor::ConnectFile( m_io_sp = std::make_shared(fd, File::eOpenOptionReadWrite, true); return eConnectionStatusSuccess; -#endif // LLDB_ENABLE_POSIX -#endif +#endif // LLDB_ENABLE_POSIX && !defined(_AIX) llvm_unreachable("this function should be only called w/ LLDB_ENABLE_POSIX"); } diff --git a/lldb/source/Host/posix/DomainSocket.cpp b/lldb/source/Host/posix/DomainSocket.cpp index 6cbffb2d9c4bd..28db5964a5a8a 100644 --- a/lldb/source/Host/posix/DomainSocket.cpp +++ b/lldb/source/Host/posix/DomainSocket.cpp @@ -89,8 +89,9 @@ Status DomainSocket::Connect(llvm::StringRef name) { m_socket = CreateSocket(kDomain, kType, 0, error); if (error.Fail()) return error; - if (llvm::sys::RetryAfterSignal(-1, ::connect, GetNativeSocket(), - (struct sockaddr *)&saddr_un, saddr_un_len) < 0) + if (llvm::sys::RetryAfterSignal(-1, ::connect, GetNativeSocket(), + (struct sockaddr *)&saddr_un, + saddr_un_len) < 0) SetLastError(error); return error; diff --git a/lldb/source/Host/posix/FileSystemPosix.cpp b/lldb/source/Host/posix/FileSystemPosix.cpp index 21da5612ff6b8..1a84f550662d7 100644 --- a/lldb/source/Host/posix/FileSystemPosix.cpp +++ b/lldb/source/Host/posix/FileSystemPosix.cpp @@ -11,7 +11,7 @@ // C includes #include #include -#if !defined(_AIX) +#ifndef _AIX #include #endif #include diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp index 125b954023dc0..e4ff928a58962 100644 --- a/lldb/source/Host/posix/MainLoopPosix.cpp +++ b/lldb/source/Host/posix/MainLoopPosix.cpp @@ -99,6 
+99,7 @@ class MainLoopPosix::RunImpl { ~RunImpl() = default; Status Poll(); + int StartPoll(std::optional point); void ProcessReadEvents(); private: @@ -159,6 +160,22 @@ MainLoopPosix::RunImpl::RunImpl(MainLoopPosix &loop) : loop(loop) { read_fds.reserve(loop.m_read_fds.size()); } +int MainLoopPosix::RunImpl::StartPoll( + std::optional point) { +#if HAVE_PPOLL + return ppoll(read_fds.data(), read_fds.size(), ToTimeSpec(point), + /*sigmask=*/nullptr); +#else + using namespace std::chrono; + int timeout = -1; + if (point) { + nanoseconds dur = std::max(*point - steady_clock::now(), nanoseconds(0)); + timeout = ceil(dur).count(); + } + return poll(read_fds.data(), read_fds.size(), timeout); +#endif +} + Status MainLoopPosix::RunImpl::Poll() { read_fds.clear(); @@ -169,24 +186,10 @@ Status MainLoopPosix::RunImpl::Poll() { pfd.revents = 0; read_fds.push_back(pfd); } + int ready = StartPoll(loop.GetNextWakeupTime()); -#if defined(_AIX) - sigset_t origmask; - int timeout; - - timeout = -1; - pthread_sigmask(SIG_SETMASK, nullptr, &origmask); - int ready = poll(read_fds.data(), read_fds.size(), timeout); - pthread_sigmask(SIG_SETMASK, &origmask, nullptr); if (ready == -1 && errno != EINTR) return Status(errno, eErrorTypePOSIX); -#else - if (ppoll(read_fds.data(), read_fds.size(), - ToTimeSpec(loop.GetNextWakeupTime()), - /*sigmask=*/nullptr) == -1 && - errno != EINTR) - return Status(errno, eErrorTypePOSIX); -#endif return Status(); } @@ -291,16 +294,6 @@ MainLoopPosix::RegisterSignal(int signo, const Callback &callback, UNUSED_IF_ASSERT_DISABLED(ret); assert(ret == 0 && "sigaction failed"); -#if HAVE_SYS_EVENT_H - struct kevent ev; - EV_SET(&ev, signo, EVFILT_SIGNAL, EV_ADD, 0, 0, 0); - ret = kevent(m_kqueue, &ev, 1, nullptr, 0, nullptr); - assert(ret == 0); -#endif - - // If we're using kqueue, the signal needs to be unblocked in order to - // receive it. If using pselect/ppoll, we need to block it, and later unblock - // it as a part of the system call. 
ret = pthread_sigmask(SIG_UNBLOCK, &new_action.sa_mask, &old_set); assert(ret == 0 && "pthread_sigmask failed"); info.was_blocked = sigismember(&old_set, signo); diff --git a/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp b/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp index 52fc58aa21bf4..7b8b42a4b7fe0 100644 --- a/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp +++ b/lldb/source/Host/posix/ProcessLauncherPosixFork.cpp @@ -197,7 +197,7 @@ struct ForkLaunchInfo { #else if (ptrace(PT_TRACE_ME, 0, nullptr, 0) == -1) #endif - ExitWithError(error_fd, "ptrace"); + ExitWithError(error_fd, "ptrace"); } // Execute. We should never return... diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp index 1d841a032aa6e..1d79edbede5d6 100644 --- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp +++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp @@ -31,7 +31,6 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/bit.h" - using namespace lldb; using namespace lldb_private; using namespace lldb_private::formatters; @@ -267,21 +266,21 @@ bool lldb_private::formatters::NSIndexSetSummaryProvider( if (class_name == "NSIndexSet" || class_name == "NSMutableIndexSet") { // Foundation version 2000 added a bitmask if the index set fit in 64 bits // and a Tagged Pointer version if the bitmask is small enough to fit in - // the tagged pointer payload. + // the tagged pointer payload. // It also changed the layout (but not the size) of the set descriptor. // First check whether this is a tagged pointer. The bitmask will be in // the payload of the tagged pointer. 
uint64_t payload; - if (runtime->GetFoundationVersion() >= 2000 - && descriptor->GetTaggedPointerInfo(nullptr, nullptr, &payload)) { + if (runtime->GetFoundationVersion() >= 2000 && + descriptor->GetTaggedPointerInfo(nullptr, nullptr, &payload)) { count = llvm::popcount(payload); break; } // The first 32 bits describe the index set in all cases: Status error; uint32_t mode = process_sp->ReadUnsignedIntegerFromMemory( - valobj_addr + ptr_size, 4, 0, error); + valobj_addr + ptr_size, 4, 0, error); if (error.Fail()) return false; // Now check if the index is held in a bitmask in the object: @@ -292,7 +291,7 @@ bool lldb_private::formatters::NSIndexSetSummaryProvider( if ((mode & 2) == 2) { // The bitfield is a 64 bit uint at the beginning of the data var. uint64_t bitfield = process_sp->ReadUnsignedIntegerFromMemory( - valobj_addr + 2 * ptr_size, 8, 0, error); + valobj_addr + 2 * ptr_size, 8, 0, error); if (error.Fail()) return false; count = llvm::popcount(bitfield); @@ -309,7 +308,7 @@ bool lldb_private::formatters::NSIndexSetSummaryProvider( count = 0; break; } - + if ((mode & 2) == 2) mode = 1; // this means the set only has one range else @@ -1227,8 +1226,7 @@ bool lldb_private::formatters::ObjCSELSummaryProvider( time_t lldb_private::formatters::GetOSXEpoch() { static time_t epoch = 0; if (!epoch) { -#if !defined(_AIX) -#ifndef _WIN32 +#if !defined(_WIN32) && !defined(_AIX) tzset(); tm tm_epoch; tm_epoch.tm_sec = 0; @@ -1241,7 +1239,6 @@ time_t lldb_private::formatters::GetOSXEpoch() { tm_epoch.tm_gmtoff = 0; tm_epoch.tm_zone = nullptr; epoch = timegm(&tm_epoch); -#endif #endif } return epoch; diff --git a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp index 4f747ab20c9ef..b202898ff438a 100644 --- a/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp +++ 
b/lldb/source/Plugins/ObjectContainer/BSD-Archive/ObjectContainerBSDArchive.cpp @@ -81,10 +81,10 @@ size_t ObjectContainerBSDArchive::Archive::ParseObjects() { std::unique_ptr mem_buffer = llvm::MemoryBuffer::getMemBuffer( - llvm::StringRef((const char *)data.GetDataStart(), - data.GetByteSize()), - llvm::StringRef(), - /*RequiresNullTerminator=*/false); + llvm::StringRef((const char *)data.GetDataStart(), + data.GetByteSize()), + llvm::StringRef(), + /*RequiresNullTerminator=*/false); auto exp_ar = llvm::object::Archive::create(mem_buffer->getMemBufferRef()); if (!exp_ar) { @@ -95,7 +95,7 @@ size_t ObjectContainerBSDArchive::Archive::ParseObjects() { llvm::Error iter_err = llvm::Error::success(); Object obj; - for (const auto &child: llvm_archive->children(iter_err)) { + for (const auto &child : llvm_archive->children(iter_err)) { obj.Clear(); auto exp_name = child.getName(); if (exp_name) { @@ -111,7 +111,9 @@ size_t ObjectContainerBSDArchive::Archive::ParseObjects() { obj.modification_time = std::chrono::duration_cast( std::chrono::time_point_cast( - exp_mtime.get()).time_since_epoch()).count(); + exp_mtime.get()) + .time_since_epoch()) + .count(); } else { LLDB_LOG_ERROR(l, exp_mtime.takeError(), "failed to get archive object time: {0}"); @@ -331,21 +333,21 @@ ObjectContainer *ObjectContainerBSDArchive::CreateInstance( ArchiveType ObjectContainerBSDArchive::MagicBytesMatch(const DataExtractor &data) { uint32_t offset = 0; - const char *armag = (const char *)data.PeekData(offset, - sizeof(ar_hdr) + SARMAG); + const char *armag = + (const char *)data.PeekData(offset, sizeof(ar_hdr) + SARMAG); if (armag == nullptr) return ArchiveType::Invalid; ArchiveType result = ArchiveType::Invalid; if (strncmp(armag, ArchiveMagic, SARMAG) == 0) - result = ArchiveType::Archive; + result = ArchiveType::Archive; else if (strncmp(armag, ThinArchiveMagic, SARMAG) == 0) - result = ArchiveType::ThinArchive; + result = ArchiveType::ThinArchive; else - return ArchiveType::Invalid; + 
return ArchiveType::Invalid; armag += offsetof(struct ar_hdr, ar_fmag) + SARMAG; if (strncmp(armag, ARFMAG, 2) == 0) - return result; + return result; return ArchiveType::Invalid; } @@ -443,7 +445,8 @@ size_t ObjectContainerBSDArchive::GetModuleSpecifications( return 0; const size_t initial_count = specs.GetSize(); - llvm::sys::TimePoint<> file_mod_time = FileSystem::Instance().GetModificationTime(file); + llvm::sys::TimePoint<> file_mod_time = + FileSystem::Instance().GetModificationTime(file); Archive::shared_ptr archive_sp( Archive::FindCachedArchive(file, ArchSpec(), file_mod_time, file_offset)); bool set_archive_arch = false; diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index ac91183a271cc..85bb85044ec15 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -14,7 +14,6 @@ #include "lldb/lldb-defines.h" #include "llvm/ADT/STLExtras.h" #include "llvm/BinaryFormat/COFF.h" -#include "llvm/BinaryFormat/XCOFF.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/BinaryFormat/XCOFF.h" From 8fcf69ed77148f8b339b87f75ed97e5ce719b4ba Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Fri, 27 Dec 2024 06:50:27 -0600 Subject: [PATCH 19/58] Some Updates --- lldb/include/lldb/Host/aix/HostInfoAIX.h | 11 +-- lldb/source/Host/aix/HostInfoAIX.cpp | 65 +------------- .../ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 87 +++++++++---------- .../ObjectFile/XCOFF/ObjectFileXCOFF.h | 11 ++- 4 files changed, 50 insertions(+), 124 deletions(-) diff --git a/lldb/include/lldb/Host/aix/HostInfoAIX.h b/lldb/include/lldb/Host/aix/HostInfoAIX.h index ced4cf34d38a8..ba727e1d5f171 100644 --- a/lldb/include/lldb/Host/aix/HostInfoAIX.h +++ b/lldb/include/lldb/Host/aix/HostInfoAIX.h @@ -6,16 +6,14 @@ // //===----------------------------------------------------------------------===// -#ifndef lldb_Host_aix_HostInfoAIX_h_ -#define lldb_Host_aix_HostInfoAIX_h_ +#ifndef LLDB_HOST_AIX_HOSTINFOAIX_H_ 
+#define LLDB_HOST_AIX_HOSTINFOAIX_H_ #include "lldb/Host/posix/HostInfoPosix.h" #include "lldb/Utility/FileSpec.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/VersionTuple.h" -#include - namespace lldb_private { class HostInfoAIX : public HostInfoPosix { @@ -25,15 +23,10 @@ class HostInfoAIX : public HostInfoPosix { static void Initialize(SharedLibraryDirectoryHelper *helper = nullptr); static void Terminate(); - static llvm::VersionTuple GetOSVersion(); - static std::optional GetOSBuildString(); static llvm::StringRef GetDistributionId(); static FileSpec GetProgramFileSpec(); protected: - static bool ComputeSupportExeDirectory(FileSpec &file_spec); - static bool ComputeSystemPluginsDirectory(FileSpec &file_spec); - static bool ComputeUserPluginsDirectory(FileSpec &file_spec); static void ComputeHostArchitectureSupport(ArchSpec &arch_32, ArchSpec &arch_64); }; diff --git a/lldb/source/Host/aix/HostInfoAIX.cpp b/lldb/source/Host/aix/HostInfoAIX.cpp index 8bda09e01741b..ef07b07c8cab2 100644 --- a/lldb/source/Host/aix/HostInfoAIX.cpp +++ b/lldb/source/Host/aix/HostInfoAIX.cpp @@ -29,8 +29,6 @@ namespace { struct HostInfoAIXFields { llvm::once_flag m_distribution_once_flag; std::string m_distribution_id; - llvm::once_flag m_os_version_once_flag; - llvm::VersionTuple m_os_version; }; } // namespace @@ -49,33 +47,6 @@ void HostInfoAIX::Terminate() { HostInfoBase::Terminate(); } -llvm::VersionTuple HostInfoAIX::GetOSVersion() { - assert(g_fields && "Missing call to Initialize?"); - llvm::call_once(g_fields->m_os_version_once_flag, []() { - struct utsname un; - if (uname(&un) != 0) - return; - - llvm::StringRef release = un.release; - // The kernel release string can include a lot of stuff (e.g. - // 4.9.0-6-amd64). We're only interested in the numbered prefix. 
- release = release.substr(0, release.find_first_not_of("0123456789.")); - g_fields->m_os_version.tryParse(release); - }); - - return g_fields->m_os_version; -} - -std::optional HostInfoAIX::GetOSBuildString() { - struct utsname un; - ::memset(&un, 0, sizeof(utsname)); - - if (uname(&un) < 0) - return std::nullopt; - - return std::string(un.release); -} - llvm::StringRef HostInfoAIX::GetDistributionId() { assert(g_fields && "Missing call to Initialize?"); // Try to run 'lbs_release -i', and use that response for the distribution @@ -122,8 +93,7 @@ llvm::StringRef HostInfoAIX::GetDistributionId() { if (strstr(distribution_id, distributor_id_key)) { // strip newlines std::string id_string(distribution_id + strlen(distributor_id_key)); - id_string.erase(std::remove(id_string.begin(), id_string.end(), '\n'), - id_string.end()); + llvm::erase(id_string, '\n'); // lower case it and convert whitespace to underscores std::transform( @@ -167,42 +137,11 @@ FileSpec HostInfoAIX::GetProgramFileSpec() { return g_program_filespec; } -bool HostInfoAIX::ComputeSupportExeDirectory(FileSpec &file_spec) { - if (HostInfoPosix::ComputeSupportExeDirectory(file_spec) && - file_spec.IsAbsolute() && FileSystem::Instance().Exists(file_spec)) - return true; - file_spec.SetDirectory(GetProgramFileSpec().GetDirectory()); - return !file_spec.GetDirectory().IsEmpty(); -} - -bool HostInfoAIX::ComputeSystemPluginsDirectory(FileSpec &file_spec) { - FileSpec temp_file("/usr/" LLDB_INSTALL_LIBDIR_BASENAME "/lldb/plugins"); - FileSystem::Instance().Resolve(temp_file); - file_spec.SetDirectory(temp_file.GetPath()); - return true; -} - -bool HostInfoAIX::ComputeUserPluginsDirectory(FileSpec &file_spec) { - // XDG Base Directory Specification - // http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html If - // XDG_DATA_HOME exists, use that, otherwise use ~/.local/share/lldb. 
- const char *xdg_data_home = getenv("XDG_DATA_HOME"); - if (xdg_data_home && xdg_data_home[0]) { - std::string user_plugin_dir(xdg_data_home); - user_plugin_dir += "/lldb"; - file_spec.SetDirectory(user_plugin_dir.c_str()); - } else - file_spec.SetDirectory("~/.local/share/lldb"); - return true; -} - void HostInfoAIX::ComputeHostArchitectureSupport(ArchSpec &arch_32, ArchSpec &arch_64) { HostInfoPosix::ComputeHostArchitectureSupport(arch_32, arch_64); - const char *distribution_id = GetDistributionId().data(); - - // On Linux, "unknown" in the vendor slot isn't what we want for the default + // "unknown" in the vendor slot isn't what we want for the default // triple. It's probably an artifact of config.guess. if (arch_32.IsValid()) { if (arch_32.GetTriple().getVendor() == llvm::Triple::UnknownVendor) diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index a4d9ea295b4c3..afd8027bab06c 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -1,4 +1,5 @@ -//===-- ObjectFileXCOFF.cpp -------------------------------------------------===// +//===-- ObjectFileXCOFF.cpp +//-------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -7,13 +8,6 @@ //===----------------------------------------------------------------------===// #include "ObjectFileXCOFF.h" - -#include -#include -#include -#include - -#include "lldb/Utility/FileSpecList.h" #include "lldb/Core/Module.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Core/PluginManager.h" @@ -28,6 +22,7 @@ #include "lldb/Target/Target.h" #include "lldb/Utility/ArchSpec.h" #include "lldb/Utility/DataBufferHeap.h" +#include "lldb/Utility/FileSpecList.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/RangeMap.h" @@ -38,12 +33,16 @@ #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/XCOFF.h" +#include "llvm/Object/XCOFFObjectFile.h" #include "llvm/Object/Decompressor.h" #include "llvm/Support/CRC.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Object/XCOFFObjectFile.h" +#include +#include +#include +#include using namespace llvm; using namespace lldb; @@ -69,21 +68,19 @@ void ObjectFileXCOFF::Terminate() { bool UGLY_FLAG_FOR_AIX __attribute__((weak)) = false; ObjectFile *ObjectFileXCOFF::CreateInstance(const lldb::ModuleSP &module_sp, - DataBufferSP data_sp, - lldb::offset_t data_offset, - const lldb_private::FileSpec *file, - lldb::offset_t file_offset, - lldb::offset_t length) { + DataBufferSP data_sp, + lldb::offset_t data_offset, + const lldb_private::FileSpec *file, + lldb::offset_t file_offset, + lldb::offset_t length) { if (!data_sp) { data_sp = MapFileData(*file, length, file_offset); if (!data_sp) return nullptr; data_offset = 0; } - if (!ObjectFileXCOFF::MagicBytesMatch(data_sp, data_offset, length)) return nullptr; - // Update the data to contain the entire file if it doesn't already if (data_sp->GetByteSize() < length) { data_sp = MapFileData(*file, length, file_offset); @@ -114,15 +111,15 @@ bool ObjectFileXCOFF::CreateBinary() { Log *log = GetLog(LLDBLog::Object); 
- auto binary = llvm::object::XCOFFObjectFile::createObjectFile(llvm::MemoryBufferRef( - toStringRef(m_data.GetData()), m_file.GetFilename().GetStringRef()), - file_magic::xcoff_object_64); + auto binary = llvm::object::ObjectFile::createObjectFile( + llvm::MemoryBufferRef(toStringRef(m_data.GetData()), + m_file.GetFilename().GetStringRef()), + file_magic::xcoff_object_64); if (!binary) { LLDB_LOG_ERROR(log, binary.takeError(), "Failed to create binary for file ({1}): {0}", m_file); return false; } - // Make sure we only handle COFF format. m_binary = llvm::unique_dyn_cast(std::move(*binary)); @@ -132,6 +129,7 @@ bool ObjectFileXCOFF::CreateBinary() { LLDB_LOG(log, "this = {0}, module = {1} ({2}), file = {3}, binary = {4}", this, GetModule().get(), GetModule()->GetSpecificationDescription(), m_file.GetPath(), m_binary.get()); + return true; } @@ -148,9 +146,12 @@ size_t ObjectFileXCOFF::GetModuleSpecifications( const size_t initial_count = specs.GetSize(); if (ObjectFileXCOFF::MagicBytesMatch(data_sp, 0, data_sp->GetByteSize())) { - ArchSpec arch_spec = ArchSpec(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE); + ArchSpec arch_spec = + ArchSpec(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE); ModuleSpec spec(file, arch_spec); - spec.GetArchitecture().SetArchitecture(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE, llvm::Triple::AIX); + spec.GetArchitecture().SetArchitecture(eArchTypeXCOFF, XCOFF::TCPU_PPC64, + LLDB_INVALID_CPUTYPE, + llvm::Triple::AIX); specs.Append(spec); } return specs.GetSize() - initial_count; @@ -158,11 +159,9 @@ size_t ObjectFileXCOFF::GetModuleSpecifications( static uint32_t XCOFFHeaderSizeFromMagic(uint32_t magic) { switch (magic) { - /* TODO: 32bit not supported yet - case XCOFF::XCOFF32: - return sizeof(struct llvm::object::XCOFFFileHeader32); - */ - + // TODO: 32bit not supported. 
+ // case XCOFF::XCOFF32: + // return sizeof(struct llvm::object::XCOFFFileHeader32); case XCOFF::XCOFF64: return sizeof(struct llvm::object::XCOFFFileHeader64); break; @@ -174,10 +173,12 @@ static uint32_t XCOFFHeaderSizeFromMagic(uint32_t magic) { } bool ObjectFileXCOFF::MagicBytesMatch(DataBufferSP &data_sp, - lldb::addr_t data_offset, - lldb::addr_t data_length) { - lldb_private::DataExtractor data; + lldb::addr_t data_offset, + lldb::addr_t data_length) { + lldb_private::DataExtractor data; data.SetData(data_sp, data_offset, data_length); + // Need to set this as XCOFF is only compatible with Big Endian + data.SetByteOrder(eByteOrderBig); lldb::offset_t offset = 0; uint16_t magic = data.GetU16(&offset); return XCOFFHeaderSizeFromMagic(magic) != 0; @@ -386,13 +387,10 @@ bool ObjectFileXCOFF::SetLoadAddressByType(Target &target, lldb::addr_t value, return changed; } -ByteOrder ObjectFileXCOFF::GetByteOrder() const { - return eByteOrderBig; -} -bool ObjectFileXCOFF::IsExecutable() const { - return true; -} +ByteOrder ObjectFileXCOFF::GetByteOrder() const { return eByteOrderBig; } + +bool ObjectFileXCOFF::IsExecutable() const { return true; } uint32_t ObjectFileXCOFF::GetAddressByteSize() const { if (m_xcoff_header.magic == XCOFF::XCOFF64) @@ -592,13 +590,12 @@ void ObjectFileXCOFF::Dump(Stream *s) { } ArchSpec ObjectFileXCOFF::GetArchitecture() { - ArchSpec arch_spec = ArchSpec(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE); + ArchSpec arch_spec = + ArchSpec(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE); return arch_spec; } -UUID ObjectFileXCOFF::GetUUID() { - return UUID(); -} +UUID ObjectFileXCOFF::GetUUID() { return UUID(); } std::optional ObjectFileXCOFF::GetDebugLink() { return std::nullopt; @@ -724,16 +721,14 @@ lldb_private::Address ObjectFileXCOFF::GetBaseAddress() { } ObjectFile::Type ObjectFileXCOFF::CalculateType() { - if (m_xcoff_header.flags & XCOFF::F_EXEC) + if (m_binary->fileHeader64()->Flags & XCOFF::F_EXEC) return 
eTypeExecutable; - else if (m_xcoff_header.flags & XCOFF::F_SHROBJ) + else if (m_binary->fileHeader64()->Flags & XCOFF::F_SHROBJ) return eTypeSharedLibrary; return eTypeUnknown; } -ObjectFile::Strata ObjectFileXCOFF::CalculateStrata() { - return eStrataUnknown; -} +ObjectFile::Strata ObjectFileXCOFF::CalculateStrata() { return eStrataUnknown; } llvm::StringRef ObjectFileXCOFF::StripLinkerSymbolAnnotations(llvm::StringRef symbol_name) const { @@ -752,7 +747,7 @@ ObjectFileXCOFF::GetLoadableData(Target &target) { lldb::WritableDataBufferSP ObjectFileXCOFF::MapFileDataWritable(const FileSpec &file, uint64_t Size, - uint64_t Offset) { + uint64_t Offset) { return FileSystem::Instance().CreateWritableDataBuffer(file.GetPath(), Size, Offset); } diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h index 5a12d16886489..f827fca3932f4 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h @@ -1,4 +1,5 @@ -//===-- ObjectFileXCOFF.h --------------------------------------- -*- C++ -*-===// +//===-- ObjectFileXCOFF.h --------------------------------------- -*- C++ +//-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,16 +10,14 @@ #ifndef LLDB_SOURCE_PLUGINS_OBJECTFILE_XCOFF_OBJECTFILEXCOFF_H #define LLDB_SOURCE_PLUGINS_OBJECTFILE_XCOFF_OBJECTFILEXCOFF_H -#include - -#include - #include "lldb/Symbol/ObjectFile.h" #include "lldb/Utility/ArchSpec.h" #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/UUID.h" #include "lldb/lldb-private.h" #include "llvm/Object/XCOFFObjectFile.h" +#include +#include /// \class ObjectFileXCOFF /// Generic XCOFF object file reader. 
@@ -240,4 +239,4 @@ class ObjectFileXCOFF : public lldb_private::ObjectFile { std::map> m_deps_base_members; }; -#endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_ELF_OBJECTFILEELF_H +#endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_XCOFF_OBJECTFILE_H From f8b05dfc9fc75177a63dfa2d6df4a9af143b09b8 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Sun, 5 Jan 2025 05:01:47 -0600 Subject: [PATCH 20/58] HostInfoAIX Cleanup --- lldb/include/lldb/Host/aix/HostInfoAIX.h | 4 - lldb/source/Host/aix/HostInfoAIX.cpp | 108 ----------------------- 2 files changed, 112 deletions(-) diff --git a/lldb/include/lldb/Host/aix/HostInfoAIX.h b/lldb/include/lldb/Host/aix/HostInfoAIX.h index ba727e1d5f171..5a52c42fa6199 100644 --- a/lldb/include/lldb/Host/aix/HostInfoAIX.h +++ b/lldb/include/lldb/Host/aix/HostInfoAIX.h @@ -23,12 +23,8 @@ class HostInfoAIX : public HostInfoPosix { static void Initialize(SharedLibraryDirectoryHelper *helper = nullptr); static void Terminate(); - static llvm::StringRef GetDistributionId(); static FileSpec GetProgramFileSpec(); -protected: - static void ComputeHostArchitectureSupport(ArchSpec &arch_32, - ArchSpec &arch_64); }; } diff --git a/lldb/source/Host/aix/HostInfoAIX.cpp b/lldb/source/Host/aix/HostInfoAIX.cpp index ef07b07c8cab2..2996fcb55f811 100644 --- a/lldb/source/Host/aix/HostInfoAIX.cpp +++ b/lldb/source/Host/aix/HostInfoAIX.cpp @@ -11,117 +11,25 @@ #include "lldb/Host/FileSystem.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" - #include "llvm/Support/Threading.h" - #include #include #include #include #include - #include #include using namespace lldb_private; -namespace { -struct HostInfoAIXFields { - llvm::once_flag m_distribution_once_flag; - std::string m_distribution_id; -}; -} // namespace - -static HostInfoAIXFields *g_fields = nullptr; - void HostInfoAIX::Initialize(SharedLibraryDirectoryHelper *helper) { HostInfoPosix::Initialize(helper); - - g_fields = new HostInfoAIXFields(); } void HostInfoAIX::Terminate() { - assert(g_fields && 
"Missing call to Initialize?"); - delete g_fields; - g_fields = nullptr; HostInfoBase::Terminate(); } -llvm::StringRef HostInfoAIX::GetDistributionId() { - assert(g_fields && "Missing call to Initialize?"); - // Try to run 'lbs_release -i', and use that response for the distribution - // id. - llvm::call_once(g_fields->m_distribution_once_flag, []() { - Log *log = GetLog(LLDBLog::Host); - LLDB_LOGF(log, "attempting to determine AIX distribution..."); - - // check if the lsb_release command exists at one of the following paths - const char *const exe_paths[] = {"/bin/lsb_release", - "/usr/bin/lsb_release"}; - - for (size_t exe_index = 0; - exe_index < sizeof(exe_paths) / sizeof(exe_paths[0]); ++exe_index) { - const char *const get_distribution_info_exe = exe_paths[exe_index]; - if (access(get_distribution_info_exe, F_OK)) { - // this exe doesn't exist, move on to next exe - LLDB_LOGF(log, "executable doesn't exist: %s", - get_distribution_info_exe); - continue; - } - - // execute the distribution-retrieval command, read output - std::string get_distribution_id_command(get_distribution_info_exe); - get_distribution_id_command += " -i"; - - FILE *file = popen(get_distribution_id_command.c_str(), "r"); - if (!file) { - LLDB_LOGF(log, - "failed to run command: \"%s\", cannot retrieve " - "platform information", - get_distribution_id_command.c_str()); - break; - } - - // retrieve the distribution id string. 
- char distribution_id[256] = {'\0'}; - if (fgets(distribution_id, sizeof(distribution_id) - 1, file) != - nullptr) { - LLDB_LOGF(log, "distribution id command returned \"%s\"", - distribution_id); - - const char *const distributor_id_key = "Distributor ID:\t"; - if (strstr(distribution_id, distributor_id_key)) { - // strip newlines - std::string id_string(distribution_id + strlen(distributor_id_key)); - llvm::erase(id_string, '\n'); - - // lower case it and convert whitespace to underscores - std::transform( - id_string.begin(), id_string.end(), id_string.begin(), - [](char ch) { return tolower(isspace(ch) ? '_' : ch); }); - - g_fields->m_distribution_id = id_string; - LLDB_LOGF(log, "distribution id set to \"%s\"", - g_fields->m_distribution_id.c_str()); - } else { - LLDB_LOGF(log, "failed to find \"%s\" field in \"%s\"", - distributor_id_key, distribution_id); - } - } else { - LLDB_LOGF(log, - "failed to retrieve distribution id, \"%s\" returned no" - " lines", - get_distribution_id_command.c_str()); - } - - // clean up the file - pclose(file); - } - }); - - return g_fields->m_distribution_id; -} - FileSpec HostInfoAIX::GetProgramFileSpec() { static FileSpec g_program_filespec; @@ -136,19 +44,3 @@ FileSpec HostInfoAIX::GetProgramFileSpec() { return g_program_filespec; } - -void HostInfoAIX::ComputeHostArchitectureSupport(ArchSpec &arch_32, - ArchSpec &arch_64) { - HostInfoPosix::ComputeHostArchitectureSupport(arch_32, arch_64); - - // "unknown" in the vendor slot isn't what we want for the default - // triple. It's probably an artifact of config.guess. 
- if (arch_32.IsValid()) { - if (arch_32.GetTriple().getVendor() == llvm::Triple::UnknownVendor) - arch_32.GetTriple().setVendorName(llvm::StringRef()); - } - if (arch_64.IsValid()) { - if (arch_64.GetTriple().getVendor() == llvm::Triple::UnknownVendor) - arch_64.GetTriple().setVendorName(llvm::StringRef()); - } -} From 57d080e44e80203a6ab848c362469954a7c9f067 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Sun, 5 Jan 2025 05:49:40 -0600 Subject: [PATCH 21/58] Cleanup HostInfoAIX Including the previous commit, Removed: GetDistributionID, ComputeHostArchitectureSupport and Reduced GetProgramFileSpec as it was not needed --- lldb/source/Host/aix/HostInfoAIX.cpp | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/lldb/source/Host/aix/HostInfoAIX.cpp b/lldb/source/Host/aix/HostInfoAIX.cpp index 2996fcb55f811..d09b9052668af 100644 --- a/lldb/source/Host/aix/HostInfoAIX.cpp +++ b/lldb/source/Host/aix/HostInfoAIX.cpp @@ -26,21 +26,9 @@ void HostInfoAIX::Initialize(SharedLibraryDirectoryHelper *helper) { HostInfoPosix::Initialize(helper); } -void HostInfoAIX::Terminate() { - HostInfoBase::Terminate(); -} +void HostInfoAIX::Terminate() { HostInfoBase::Terminate(); } FileSpec HostInfoAIX::GetProgramFileSpec() { static FileSpec g_program_filespec; - - if (!g_program_filespec) { - char exe_path[PATH_MAX]; - ssize_t len = readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1); - if (len > 0) { - exe_path[len] = 0; - g_program_filespec.SetFile(exe_path, FileSpec::Style::native); - } - } - return g_program_filespec; } From 673713a9339de4e4ea395ee2e7f65dc1db43bcf9 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Sun, 5 Jan 2025 15:35:23 -0600 Subject: [PATCH 22/58] Removing headers --- lldb/include/lldb/Host/aix/HostInfoAIX.h | 2 -- lldb/source/Host/aix/HostInfoAIX.cpp | 12 ------------ 2 files changed, 14 deletions(-) diff --git a/lldb/include/lldb/Host/aix/HostInfoAIX.h b/lldb/include/lldb/Host/aix/HostInfoAIX.h index 
5a52c42fa6199..331a274630850 100644 --- a/lldb/include/lldb/Host/aix/HostInfoAIX.h +++ b/lldb/include/lldb/Host/aix/HostInfoAIX.h @@ -11,8 +11,6 @@ #include "lldb/Host/posix/HostInfoPosix.h" #include "lldb/Utility/FileSpec.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/VersionTuple.h" namespace lldb_private { diff --git a/lldb/source/Host/aix/HostInfoAIX.cpp b/lldb/source/Host/aix/HostInfoAIX.cpp index d09b9052668af..61b47462dd647 100644 --- a/lldb/source/Host/aix/HostInfoAIX.cpp +++ b/lldb/source/Host/aix/HostInfoAIX.cpp @@ -7,18 +7,6 @@ //===----------------------------------------------------------------------===// #include "lldb/Host/aix/HostInfoAIX.h" -#include "lldb/Host/Config.h" -#include "lldb/Host/FileSystem.h" -#include "lldb/Utility/LLDBLog.h" -#include "lldb/Utility/Log.h" -#include "llvm/Support/Threading.h" -#include -#include -#include -#include -#include -#include -#include using namespace lldb_private; From cdc31f3963365e4595247ff5a7155662df1ce1af Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Mon, 6 Jan 2025 09:28:56 -0600 Subject: [PATCH 23/58] Reverted merge blunder CMakeLists --- lldb/source/Host/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt index f326bc07dc1f6..f4fca8acc4d83 100644 --- a/lldb/source/Host/CMakeLists.txt +++ b/lldb/source/Host/CMakeLists.txt @@ -141,7 +141,10 @@ else() elseif (CMAKE_SYSTEM_NAME MATCHES "AIX") add_host_subdirectory(aix + aix/AbstractSocket.cpp + aix/Host.cpp aix/HostInfoAIX.cpp + aix/Support.cpp ) endif() endif() From 713a6cbbb97b9bc56b039ab050a1690eccc017e3 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Mon, 6 Jan 2025 09:39:36 -0600 Subject: [PATCH 24/58] Removed DomainSocket.cpp FileSystemPosix.cpp includes --- lldb/source/Host/posix/DomainSocket.cpp | 4 ---- lldb/source/Host/posix/FileSystemPosix.cpp | 3 --- 2 files changed, 7 deletions(-) diff --git a/lldb/source/Host/posix/DomainSocket.cpp 
b/lldb/source/Host/posix/DomainSocket.cpp index f30e84d83efca..be8fcdf2c8f2c 100644 --- a/lldb/source/Host/posix/DomainSocket.cpp +++ b/lldb/source/Host/posix/DomainSocket.cpp @@ -17,10 +17,6 @@ #include #include -#if defined(_AIX) -#include -#endif - using namespace lldb; using namespace lldb_private; diff --git a/lldb/source/Host/posix/FileSystemPosix.cpp b/lldb/source/Host/posix/FileSystemPosix.cpp index 1a84f550662d7..a631bb01209ec 100644 --- a/lldb/source/Host/posix/FileSystemPosix.cpp +++ b/lldb/source/Host/posix/FileSystemPosix.cpp @@ -11,9 +11,6 @@ // C includes #include #include -#ifndef _AIX -#include -#endif #include #include #include From 0a706d29dabeefa62e354fc9358d650f89032048 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Mon, 6 Jan 2025 09:44:10 -0600 Subject: [PATCH 25/58] sys/mount.h --- lldb/source/Host/posix/FileSystemPosix.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/source/Host/posix/FileSystemPosix.cpp b/lldb/source/Host/posix/FileSystemPosix.cpp index a631bb01209ec..945e2affc8371 100644 --- a/lldb/source/Host/posix/FileSystemPosix.cpp +++ b/lldb/source/Host/posix/FileSystemPosix.cpp @@ -11,6 +11,7 @@ // C includes #include #include +#include #include #include #include From 84ebb4ec9b542c38fe1b60261a3253383e9fbf5d Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Mon, 6 Jan 2025 09:54:58 -0600 Subject: [PATCH 26/58] sys/mount.h --- lldb/source/Host/posix/FileSystemPosix.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/source/Host/posix/FileSystemPosix.cpp b/lldb/source/Host/posix/FileSystemPosix.cpp index 945e2affc8371..1a84f550662d7 100644 --- a/lldb/source/Host/posix/FileSystemPosix.cpp +++ b/lldb/source/Host/posix/FileSystemPosix.cpp @@ -11,7 +11,9 @@ // C includes #include #include +#ifndef _AIX #include +#endif #include #include #include From 844f7980040de9e13620e9d65a3fcaef56b6c8d6 Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Thu, 9 Jan 2025 09:47:43 +0530 Subject: [PATCH 27/58] Removed _AIX 
from ConnectionFileDescriptorPosix.cpp --- lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp index 2530c8fa353ba..0ed2016667162 100644 --- a/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp +++ b/lldb/source/Host/posix/ConnectionFileDescriptorPosix.cpp @@ -715,7 +715,7 @@ ConnectionFileDescriptor::ConnectFD(llvm::StringRef s, ConnectionStatus ConnectionFileDescriptor::ConnectFile( llvm::StringRef s, socket_id_callback_type socket_id_callback, Status *error_ptr) { -#if LLDB_ENABLE_POSIX && !defined(_AIX) +#if LLDB_ENABLE_POSIX std::string addr_str = s.str(); // file:///PATH int fd = FileSystem::Instance().Open(addr_str.c_str(), O_RDWR); @@ -756,7 +756,7 @@ ConnectionStatus ConnectionFileDescriptor::ConnectFile( m_io_sp = std::make_shared(fd, File::eOpenOptionReadWrite, true); return eConnectionStatusSuccess; -#endif // LLDB_ENABLE_POSIX && !defined(_AIX) +#endif // LLDB_ENABLE_POSIX llvm_unreachable("this function should be only called w/ LLDB_ENABLE_POSIX"); } From 4fe42cda7c2f4990b18a39c1d6563094fb88775f Mon Sep 17 00:00:00 2001 From: ravindra shinde Date: Fri, 17 Jan 2025 13:58:13 +0530 Subject: [PATCH 28/58] [ObjectFileXCOFF] Fix access to protected member 'GetSectionLoadList' in Target - Added a public method to Target for accessing 'GetSectionLoadList' safely. - Updated ObjectFileXCOFF to use the new public method, ensuring compliance with encapsulation rules. This resolves the build error caused by direct access to the protected member. 
Signed-off-by: ravindra shinde --- lldb/include/lldb/Target/Target.h | 5 +++++ lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index f31ac381391b4..75f9c9c2e999c 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -522,6 +522,7 @@ class Target : public std::enable_shared_from_this, eBroadcastBitSymbolsChanged = (1 << 5), }; + // These two functions fill out the Broadcaster interface: static llvm::StringRef GetStaticBroadcasterClass(); @@ -1644,6 +1645,10 @@ class Target : public std::enable_shared_from_this, TargetStats &GetStatistics() { return m_stats; } +public: + SectionLoadList &GetSectionLoadListPublic() { + return GetSectionLoadList(); + } protected: /// Construct with optional file and arch. /// diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index afd8027bab06c..cf11e5fb8f5a3 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -340,7 +340,7 @@ bool ObjectFileXCOFF::SetLoadAddress(Target &target, lldb::addr_t value, strcmp(section_sp->GetName().AsCString(), ".bss") == 0) use_offset = true; - if (target.GetSectionLoadList().SetSectionLoadAddress( + if (target.GetSectionLoadListPublic().SetSectionLoadAddress( section_sp, (use_offset ? 
(section_sp->GetFileOffset() + value) : (section_sp->GetFileAddress() + value)))) ++num_loaded_sections; @@ -369,13 +369,13 @@ bool ObjectFileXCOFF::SetLoadAddressByType(Target &target, lldb::addr_t value, SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); if (type_id == 1 && section_sp && strcmp(section_sp->GetName().AsCString(), ".text") == 0) { if (!section_sp->IsThreadSpecific()) { - if (target.GetSectionLoadList().SetSectionLoadAddress( + if (target.GetSectionLoadListPublic().SetSectionLoadAddress( section_sp, section_sp->GetFileOffset() + value)) ++num_loaded_sections; } } else if (type_id == 2 && section_sp && strcmp(section_sp->GetName().AsCString(), ".data") == 0) { if (!section_sp->IsThreadSpecific()) { - if (target.GetSectionLoadList().SetSectionLoadAddress( + if (target.GetSectionLoadListPublic().SetSectionLoadAddress( section_sp, section_sp->GetFileAddress() + value)) ++num_loaded_sections; } From e5ed4f21c5bbc709e5e2eff0f83995cc6d89de48 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Sun, 19 Jan 2025 03:43:11 -0600 Subject: [PATCH 29/58] Resolved cmake failure for SBProgress.cpp --- lldb/source/API/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index eb348e2b97232..0a03e000c0cae 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -85,6 +85,7 @@ add_lldb_library(liblldb STATIC ${option_framework} SBModuleSpec.cpp SBPlatform.cpp SBProcess.cpp + SBProgress.cpp SBProcessInfo.cpp SBProcessInfoList.cpp SBQueue.cpp From 82dbcb0e776c438e5f40c9f8d8c8e8eb81b7febd Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Mon, 27 Jan 2025 06:35:19 -0600 Subject: [PATCH 30/58] Host.cpp ANDROID --- lldb/source/Host/common/Host.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index 758e9f49ade2c..adf74df8aa90b 100644 --- 
a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -1,4 +1,4 @@ -//===-- Host.cpp ----------------------------------------------------------===// + // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -516,7 +516,6 @@ static int dladdr(const void *ptr, Dl_info *dl) FileSpec Host::GetModuleFileSpecForHostAddress(const void *host_addr) { FileSpec module_filespec; -#if !defined(__ANDROID__) #ifdef _AIX if (host_addr == reinterpret_cast(HostInfoBase::ComputeSharedLibraryDirectory)) { // FIXME: AIX dladdr return "lldb" for this case @@ -527,6 +526,7 @@ FileSpec Host::GetModuleFileSpecForHostAddress(const void *host_addr) { } } #endif +#if !defined(__ANDROID__) Dl_info info; if (::dladdr(host_addr, &info)) { if (info.dli_fname) { @@ -534,6 +534,7 @@ FileSpec Host::GetModuleFileSpecForHostAddress(const void *host_addr) { FileSystem::Instance().Resolve(module_filespec); } } +#endif return module_filespec; } From 60294eaa1611632afc94b9da503752e34ad2703d Mon Sep 17 00:00:00 2001 From: Ravindra Shinde Date: Tue, 4 Feb 2025 03:23:56 -0600 Subject: [PATCH 31/58] Resolving the fatal error while build Build is failing due to the fatal error: 'sys/syscall.h' file not found Signed-off-by: Ravindra Shinde --- lldb/source/Host/common/Host.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index 40ce76d70d6e8..b5bd68b8539bd 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -18,8 +18,12 @@ #include #include #include + +#ifndef _AIX #include #include +#endif + #include #include #endif From 2644be59b13a61c69cc635875c94b0b4645fe76c Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Mon, 10 Feb 2025 01:49:12 -0600 Subject: [PATCH 32/58] InferiorCallPOSIX.cpp --- lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp b/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp index d1f9fe851119e..8e74cce097894 100644 --- a/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp +++ b/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp @@ -120,12 +120,13 @@ bool lldb_private::InferiorCallMmap(Process *process, addr_t &allocated_addr, arch, addr, length, prot_arg, flags, fd, offset); #if defined(_AIX) lldb::ThreadPlanSP call_plan_sp( - new ThreadPlanCallFunction(*thread, mmap_range.GetBaseAddress(), + new ThreadPlanCallFunction(*thread, mmap_addr, toc_range.GetBaseAddress(), void_ptr_type, args, options)); #else lldb::ThreadPlanSP call_plan_sp(new ThreadPlanCallFunction( *thread, mmap_addr, void_ptr_type, args, options)); +#endif if (call_plan_sp) { DiagnosticManager diagnostics; From 4805b13cba964b58def39a66ad4c4309a9b2c501 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Wed, 12 Feb 2025 07:33:04 -0600 Subject: [PATCH 33/58] Merge branch gh-101657 --- .../AIX-DYLD/DynamicLoaderAIXDYLD.cpp | 97 ------------------- .../AIX-DYLD/DynamicLoaderAIXDYLD.h | 5 +- 2 files changed, 1 insertion(+), 101 deletions(-) diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp index 1a98bb9334043..375d879c7a995 100644 --- a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp @@ -21,10 +21,6 @@ #include "llvm/Support/FileSystem.h" #if defined(_AIX) #include -#include -#include -#include -#include #endif /*#include "llvm/ADT/Triple.h" @@ -137,107 +133,14 @@ bool DynamicLoaderAIXDYLD::NotifyBreakpointHit( } -void DynamicLoaderAIXDYLD::ResolveExecutableModule( - lldb::ModuleSP &module_sp) { - Log *log = GetLog(LLDBLog::DynamicLoader); - - if (m_process == nullptr) - return; - - auto &target = 
m_process->GetTarget(); - const auto platform_sp = target.GetPlatform(); - - ProcessInstanceInfo process_info; - if (!m_process->GetProcessInfo(process_info)) { - LLDB_LOGF(log, - "DynamicLoaderPOSIXDYLD::%s - failed to get process info for " - "pid %" PRIu64, - __FUNCTION__, m_process->GetID()); - return; - } - - int32long64_t pid = m_process->GetID(); - char cwd[PATH_MAX], resolved_path[PATH_MAX]; - std::string executable_name; - bool path_resolved = false; - psinfo_t psinfo; - - std::string proc_file = "/proc/" + std::to_string(pid) + "/psinfo"; - std::string cwd_link = "/proc/" + std::to_string(pid) + "/cwd"; - std::ifstream file(proc_file, std::ios::binary); - if(!file.is_open()) - LLDB_LOGF(log, "Error: Unable to access process info "); - - file.read(reinterpret_cast(&psinfo), sizeof(psinfo_t)); - if(!file) - LLDB_LOGF(log, "Process info error: Failed to read "); - - std::string relative_path(psinfo.pr_fname); - LLDB_LOGF(log, "Relative path %s",relative_path.c_str()); - - if(readlink(cwd_link.c_str(), cwd, sizeof(cwd)) != -1){ - std::filesystem::path full_path = std::filesystem::path(cwd)/relative_path; - if(realpath(full_path.c_str(), resolved_path)) { - LLDB_LOGF(log, "Resolved Path using process info : %s", resolved_path); - path_resolved = true; - } - else - LLDB_LOGF(log, "Realpath error: Unable to resolve. 
"); - } - - executable_name = resolved_path; - if(path_resolved == false) { - std::string command_line(psinfo.pr_psargs); - LLDB_LOGF(log, "Command line: %s",command_line.c_str()); - if (!command_line.empty()) { - size_t space1 = command_line.find(' '); - executable_name = command_line.substr(0, space1); - LLDB_LOGF(log, "Resolved path using command line arg %s",executable_name.c_str()); - } - } - - LLDB_LOGF(log, "Executable Name %s",executable_name.c_str()); - process_info.SetExecutableFile(lldb_private::FileSpec(executable_name), - true); - - LLDB_LOGF( - log, "DynamicLoaderPOSIXDYLD::%s - got executable by pid %" PRIu64 ": %s", - __FUNCTION__, m_process->GetID(), - process_info.GetExecutableFile().GetPath().c_str()); - - ModuleSpec module_spec(process_info.GetExecutableFile(), - process_info.GetArchitecture()); - if (module_sp && module_sp->MatchesModuleSpec(module_spec)) - return; - - const auto executable_search_paths(Target::GetDefaultExecutableSearchPaths()); - auto error = platform_sp->ResolveExecutable( - module_spec, module_sp, - !executable_search_paths.IsEmpty() ? &executable_search_paths : nullptr); - if (error.Fail()) { - StreamString stream; - module_spec.Dump(stream); - - LLDB_LOGF(log, - "DynamicLoaderPOSIXDYLD::%s - failed to resolve executable " - "with module spec \"%s\": %s", - __FUNCTION__, stream.GetData(), error.AsCString()); - return; - } - - target.SetExecutableModule(module_sp, eLoadDependentsNo); -} - void DynamicLoaderAIXDYLD::DidAttach() { Log *log = GetLog(LLDBLog::DynamicLoader); LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); ModuleSP executable = GetTargetExecutable(); - ResolveExecutableModule(executable); if (!executable.get()) return; - LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); // Try to fetch the load address of the file from the process, since there // could be randomization of the load address. 
diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h index 0ffbe688e0069..ae4b7aca66dcc 100644 --- a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h @@ -24,7 +24,7 @@ class DynamicLoaderAIXDYLD : public DynamicLoader { static void Initialize(); static void Terminate(); - static llvm::StringRef GetPluginNameStatic() { return "aix-dyld"; } + static llvm::StringRef GetPluginNameStatic() { return "windows-dyld"; } static llvm::StringRef GetPluginDescriptionStatic(); static DynamicLoader *CreateInstance(Process *process, bool force); @@ -46,9 +46,6 @@ class DynamicLoaderAIXDYLD : public DynamicLoader { protected: lldb::addr_t GetLoadAddress(lldb::ModuleSP executable); - /// Loads Module from inferior process. - void ResolveExecutableModule(lldb::ModuleSP &module_sp); - private: std::map m_loaded_modules; }; From cff574b36903e12385e5d6cddf4532ba279f47de Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Wed, 12 Feb 2025 07:52:04 -0600 Subject: [PATCH 34/58] Fix for Debugging Attach to AIX Process --- .../AIX-DYLD/DynamicLoaderAIXDYLD.cpp | 97 +++++++++++++++++++ .../AIX-DYLD/DynamicLoaderAIXDYLD.h | 5 +- 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp index 375d879c7a995..1a98bb9334043 100644 --- a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp @@ -21,6 +21,10 @@ #include "llvm/Support/FileSystem.h" #if defined(_AIX) #include +#include +#include +#include +#include #endif /*#include "llvm/ADT/Triple.h" @@ -133,14 +137,107 @@ bool DynamicLoaderAIXDYLD::NotifyBreakpointHit( } +void DynamicLoaderAIXDYLD::ResolveExecutableModule( + lldb::ModuleSP 
&module_sp) { + Log *log = GetLog(LLDBLog::DynamicLoader); + + if (m_process == nullptr) + return; + + auto &target = m_process->GetTarget(); + const auto platform_sp = target.GetPlatform(); + + ProcessInstanceInfo process_info; + if (!m_process->GetProcessInfo(process_info)) { + LLDB_LOGF(log, + "DynamicLoaderPOSIXDYLD::%s - failed to get process info for " + "pid %" PRIu64, + __FUNCTION__, m_process->GetID()); + return; + } + + int32long64_t pid = m_process->GetID(); + char cwd[PATH_MAX], resolved_path[PATH_MAX]; + std::string executable_name; + bool path_resolved = false; + psinfo_t psinfo; + + std::string proc_file = "/proc/" + std::to_string(pid) + "/psinfo"; + std::string cwd_link = "/proc/" + std::to_string(pid) + "/cwd"; + std::ifstream file(proc_file, std::ios::binary); + if(!file.is_open()) + LLDB_LOGF(log, "Error: Unable to access process info "); + + file.read(reinterpret_cast(&psinfo), sizeof(psinfo_t)); + if(!file) + LLDB_LOGF(log, "Process info error: Failed to read "); + + std::string relative_path(psinfo.pr_fname); + LLDB_LOGF(log, "Relative path %s",relative_path.c_str()); + + if(readlink(cwd_link.c_str(), cwd, sizeof(cwd)) != -1){ + std::filesystem::path full_path = std::filesystem::path(cwd)/relative_path; + if(realpath(full_path.c_str(), resolved_path)) { + LLDB_LOGF(log, "Resolved Path using process info : %s", resolved_path); + path_resolved = true; + } + else + LLDB_LOGF(log, "Realpath error: Unable to resolve. 
"); + } + + executable_name = resolved_path; + if(path_resolved == false) { + std::string command_line(psinfo.pr_psargs); + LLDB_LOGF(log, "Command line: %s",command_line.c_str()); + if (!command_line.empty()) { + size_t space1 = command_line.find(' '); + executable_name = command_line.substr(0, space1); + LLDB_LOGF(log, "Resolved path using command line arg %s",executable_name.c_str()); + } + } + + LLDB_LOGF(log, "Executable Name %s",executable_name.c_str()); + process_info.SetExecutableFile(lldb_private::FileSpec(executable_name), + true); + + LLDB_LOGF( + log, "DynamicLoaderPOSIXDYLD::%s - got executable by pid %" PRIu64 ": %s", + __FUNCTION__, m_process->GetID(), + process_info.GetExecutableFile().GetPath().c_str()); + + ModuleSpec module_spec(process_info.GetExecutableFile(), + process_info.GetArchitecture()); + if (module_sp && module_sp->MatchesModuleSpec(module_spec)) + return; + + const auto executable_search_paths(Target::GetDefaultExecutableSearchPaths()); + auto error = platform_sp->ResolveExecutable( + module_spec, module_sp, + !executable_search_paths.IsEmpty() ? &executable_search_paths : nullptr); + if (error.Fail()) { + StreamString stream; + module_spec.Dump(stream); + + LLDB_LOGF(log, + "DynamicLoaderPOSIXDYLD::%s - failed to resolve executable " + "with module spec \"%s\": %s", + __FUNCTION__, stream.GetData(), error.AsCString()); + return; + } + + target.SetExecutableModule(module_sp, eLoadDependentsNo); +} + void DynamicLoaderAIXDYLD::DidAttach() { Log *log = GetLog(LLDBLog::DynamicLoader); LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); ModuleSP executable = GetTargetExecutable(); + ResolveExecutableModule(executable); if (!executable.get()) return; + LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); // Try to fetch the load address of the file from the process, since there // could be randomization of the load address. 
diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h index ae4b7aca66dcc..0ffbe688e0069 100644 --- a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h @@ -24,7 +24,7 @@ class DynamicLoaderAIXDYLD : public DynamicLoader { static void Initialize(); static void Terminate(); - static llvm::StringRef GetPluginNameStatic() { return "windows-dyld"; } + static llvm::StringRef GetPluginNameStatic() { return "aix-dyld"; } static llvm::StringRef GetPluginDescriptionStatic(); static DynamicLoader *CreateInstance(Process *process, bool force); @@ -46,6 +46,9 @@ class DynamicLoaderAIXDYLD : public DynamicLoader { protected: lldb::addr_t GetLoadAddress(lldb::ModuleSP executable); + /// Loads Module from inferior process. + void ResolveExecutableModule(lldb::ModuleSP &module_sp); + private: std::map m_loaded_modules; }; From 303fa3bc078d7572702fb302726d4dd1bd13ddb2 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Fri, 21 Feb 2025 06:18:33 -0600 Subject: [PATCH 35/58] Merge branch 'llvm:main' into gh-101657 --- lldb/source/Plugins/Platform/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/lldb/source/Plugins/Platform/CMakeLists.txt b/lldb/source/Plugins/Platform/CMakeLists.txt index f5b0dbdd5e0a9..0220e734b36d1 100644 --- a/lldb/source/Plugins/Platform/CMakeLists.txt +++ b/lldb/source/Plugins/Platform/CMakeLists.txt @@ -9,4 +9,3 @@ add_subdirectory(OpenBSD) add_subdirectory(POSIX) add_subdirectory(QemuUser) add_subdirectory(Windows) -add_subdirectory(AIX) From 6947dec02bb527f710c1bea3827b2c16d89f308a Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Fri, 21 Feb 2025 07:02:53 -0600 Subject: [PATCH 36/58] Merge branch 'llvm:main' into gh-101657 --- .../Plugins/Platform/AIX/PlatformAIX.cpp | 171 +----------------- 1 file changed, 1 insertion(+), 170 deletions(-) diff --git 
a/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp b/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp index 0d66325d16267..21724d83133e9 100644 --- a/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp +++ b/lldb/source/Plugins/Platform/AIX/PlatformAIX.cpp @@ -130,12 +130,6 @@ void PlatformAIX::GetStatus(Stream &strm) { #endif } -void PlatformAIX::CalculateTrapHandlerSymbolNames() { - m_trap_handlers.push_back(ConstString("_sigtramp")); - m_trap_handlers.push_back(ConstString("__kernel_rt_sigreturn")); - m_trap_handlers.push_back(ConstString("__restore_rt")); -} - void PlatformAIX::CalculateTrapHandlerSymbolNames() {} lldb::UnwindPlanSP @@ -160,168 +154,5 @@ MmapArgList PlatformAIX::GetMmapArgumentList(const ArchSpec &arch, addr_t addr, } CompilerType PlatformAIX::GetSiginfoType(const llvm::Triple &triple) { - if (!m_type_system_up) - m_type_system_up.reset(new TypeSystemClang("siginfo", triple)); - TypeSystemClang *ast = m_type_system_up.get(); - - bool si_errno_then_code = true; - - switch (triple.getArch()) { - case llvm::Triple::mips: - case llvm::Triple::mipsel: - case llvm::Triple::mips64: - case llvm::Triple::mips64el: - // mips has si_code and si_errno swapped - si_errno_then_code = false; - break; - default: - break; - } - - // generic types - CompilerType int_type = ast->GetBasicType(eBasicTypeInt); - CompilerType uint_type = ast->GetBasicType(eBasicTypeUnsignedInt); - CompilerType short_type = ast->GetBasicType(eBasicTypeShort); - CompilerType long_type = ast->GetBasicType(eBasicTypeLong); - CompilerType voidp_type = ast->GetBasicType(eBasicTypeVoid).GetPointerType(); - - // platform-specific types - CompilerType &pid_type = int_type; - CompilerType &uid_type = uint_type; - CompilerType &clock_type = long_type; - CompilerType &band_type = long_type; - - CompilerType sigval_type = ast->CreateRecordType( - nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "__lldb_sigval_t", - llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC); - 
ast->StartTagDeclarationDefinition(sigval_type); - ast->AddFieldToRecordType(sigval_type, "sival_int", int_type, - lldb::eAccessPublic, 0); - ast->AddFieldToRecordType(sigval_type, "sival_ptr", voidp_type, - lldb::eAccessPublic, 0); - ast->CompleteTagDeclarationDefinition(sigval_type); - - CompilerType sigfault_bounds_type = ast->CreateRecordType( - nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "", - llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC); - ast->StartTagDeclarationDefinition(sigfault_bounds_type); - ast->AddFieldToRecordType(sigfault_bounds_type, "_addr_bnd", - ast->CreateStructForIdentifier(ConstString(), - { - {"_lower", voidp_type}, - {"_upper", voidp_type}, - }), - lldb::eAccessPublic, 0); - ast->AddFieldToRecordType(sigfault_bounds_type, "_pkey", uint_type, - lldb::eAccessPublic, 0); - ast->CompleteTagDeclarationDefinition(sigfault_bounds_type); - - // siginfo_t - CompilerType siginfo_type = ast->CreateRecordType( - nullptr, OptionalClangModuleID(), lldb::eAccessPublic, "__lldb_siginfo_t", - llvm::to_underlying(clang::TagTypeKind::Struct), lldb::eLanguageTypeC); - ast->StartTagDeclarationDefinition(siginfo_type); - ast->AddFieldToRecordType(siginfo_type, "si_signo", int_type, - lldb::eAccessPublic, 0); - - if (si_errno_then_code) { - ast->AddFieldToRecordType(siginfo_type, "si_errno", int_type, - lldb::eAccessPublic, 0); - ast->AddFieldToRecordType(siginfo_type, "si_code", int_type, - lldb::eAccessPublic, 0); - } else { - ast->AddFieldToRecordType(siginfo_type, "si_code", int_type, - lldb::eAccessPublic, 0); - ast->AddFieldToRecordType(siginfo_type, "si_errno", int_type, - lldb::eAccessPublic, 0); - } - - // the structure is padded on 64-bit arches to fix alignment - if (triple.isArch64Bit()) - ast->AddFieldToRecordType(siginfo_type, "__pad0", int_type, - lldb::eAccessPublic, 0); - - // union used to hold the signal data - CompilerType union_type = ast->CreateRecordType( - nullptr, OptionalClangModuleID(), 
lldb::eAccessPublic, "", - llvm::to_underlying(clang::TagTypeKind::Union), lldb::eLanguageTypeC); - ast->StartTagDeclarationDefinition(union_type); - - ast->AddFieldToRecordType( - union_type, "_kill", - ast->CreateStructForIdentifier(ConstString(), - { - {"si_pid", pid_type}, - {"si_uid", uid_type}, - }), - lldb::eAccessPublic, 0); - - ast->AddFieldToRecordType( - union_type, "_timer", - ast->CreateStructForIdentifier(ConstString(), - { - {"si_tid", int_type}, - {"si_overrun", int_type}, - {"si_sigval", sigval_type}, - }), - lldb::eAccessPublic, 0); - - ast->AddFieldToRecordType( - union_type, "_rt", - ast->CreateStructForIdentifier(ConstString(), - { - {"si_pid", pid_type}, - {"si_uid", uid_type}, - {"si_sigval", sigval_type}, - }), - lldb::eAccessPublic, 0); - - ast->AddFieldToRecordType( - union_type, "_sigchld", - ast->CreateStructForIdentifier(ConstString(), - { - {"si_pid", pid_type}, - {"si_uid", uid_type}, - {"si_status", int_type}, - {"si_utime", clock_type}, - {"si_stime", clock_type}, - }), - lldb::eAccessPublic, 0); - - ast->AddFieldToRecordType( - union_type, "_sigfault", - ast->CreateStructForIdentifier(ConstString(), - { - {"si_addr", voidp_type}, - {"si_addr_lsb", short_type}, - {"_bounds", sigfault_bounds_type}, - }), - lldb::eAccessPublic, 0); - - ast->AddFieldToRecordType( - union_type, "_sigpoll", - ast->CreateStructForIdentifier(ConstString(), - { - {"si_band", band_type}, - {"si_fd", int_type}, - }), - lldb::eAccessPublic, 0); - - // NB: SIGSYS is not present on ia64 but we don't seem to support that - ast->AddFieldToRecordType( - union_type, "_sigsys", - ast->CreateStructForIdentifier(ConstString(), - { - {"_call_addr", voidp_type}, - {"_syscall", int_type}, - {"_arch", uint_type}, - }), - lldb::eAccessPublic, 0); - - ast->CompleteTagDeclarationDefinition(union_type); - ast->AddFieldToRecordType(siginfo_type, "_sifields", union_type, - lldb::eAccessPublic, 0); - - ast->CompleteTagDeclarationDefinition(siginfo_type); - return siginfo_type; + 
return CompilerType(); } From 24615119addcdf9fe5a8c5b2ebd0c15d75651279 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Sat, 22 Feb 2025 03:42:26 -0600 Subject: [PATCH 37/58] Removed un-needed changes --- lldb/include/lldb/Host/aix/AbstractSocket.h | 25 --------------------- lldb/include/lldb/Host/aix/Uio.h | 23 ------------------- lldb/source/Host/CMakeLists.txt | 1 - lldb/source/Host/aix/AbstractSocket.cpp | 20 ----------------- 4 files changed, 69 deletions(-) delete mode 100644 lldb/include/lldb/Host/aix/AbstractSocket.h delete mode 100644 lldb/include/lldb/Host/aix/Uio.h delete mode 100644 lldb/source/Host/aix/AbstractSocket.cpp diff --git a/lldb/include/lldb/Host/aix/AbstractSocket.h b/lldb/include/lldb/Host/aix/AbstractSocket.h deleted file mode 100644 index accfd01457a5e..0000000000000 --- a/lldb/include/lldb/Host/aix/AbstractSocket.h +++ /dev/null @@ -1,25 +0,0 @@ -//===-- AbstractSocket.h ----------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef liblldb_AbstractSocket_h_ -#define liblldb_AbstractSocket_h_ - -#include "lldb/Host/posix/DomainSocket.h" - -namespace lldb_private { -class AbstractSocket : public DomainSocket { -public: - AbstractSocket(); - -protected: - size_t GetNameOffset() const override; - void DeleteSocketFile(llvm::StringRef name) override; -}; -} - -#endif // ifndef liblldb_AbstractSocket_h_ diff --git a/lldb/include/lldb/Host/aix/Uio.h b/lldb/include/lldb/Host/aix/Uio.h deleted file mode 100644 index acf79ecc6a1d0..0000000000000 --- a/lldb/include/lldb/Host/aix/Uio.h +++ /dev/null @@ -1,23 +0,0 @@ -//===-- Uio.h ---------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef liblldb_Host_aix_Uio_h_ -#define liblldb_Host_aix_Uio_h_ - -#include "lldb/Host/Config.h" -#include - -// We shall provide our own implementation of process_vm_readv if it is not -// present -#if !HAVE_PROCESS_VM_READV -ssize_t process_vm_readv(::pid_t pid, const struct iovec *local_iov, - unsigned long liovcnt, const struct iovec *remote_iov, - unsigned long riovcnt, unsigned long flags); -#endif - -#endif // liblldb_Host_aix_Uio_h_ diff --git a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt index bb6b5befa16e4..5a14b4c629825 100644 --- a/lldb/source/Host/CMakeLists.txt +++ b/lldb/source/Host/CMakeLists.txt @@ -140,7 +140,6 @@ else() elseif (CMAKE_SYSTEM_NAME MATCHES "AIX") add_host_subdirectory(aix - aix/AbstractSocket.cpp aix/Host.cpp aix/HostInfoAIX.cpp aix/Support.cpp diff --git a/lldb/source/Host/aix/AbstractSocket.cpp 
b/lldb/source/Host/aix/AbstractSocket.cpp deleted file mode 100644 index fddf78f54f46d..0000000000000 --- a/lldb/source/Host/aix/AbstractSocket.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- AbstractSocket.cpp ------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "lldb/Host/aix/AbstractSocket.h" - -#include "llvm/ADT/StringRef.h" - -using namespace lldb; -using namespace lldb_private; - -AbstractSocket::AbstractSocket() : DomainSocket(ProtocolUnixAbstract) {} - -size_t AbstractSocket::GetNameOffset() const { return 1; } - -void AbstractSocket::DeleteSocketFile(llvm::StringRef name) {} From 25bea9ca48dc458c1dddd72f10a483e76926a04e Mon Sep 17 00:00:00 2001 From: HemangGadhavi Date: Wed, 26 Feb 2025 01:10:15 -0600 Subject: [PATCH 38/58] Resolving coredump issue while attach with library calls --- lldb/include/lldb/Core/ModuleSpec.h | 2 ++ lldb/source/Core/ModuleList.cpp | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lldb/include/lldb/Core/ModuleSpec.h b/lldb/include/lldb/Core/ModuleSpec.h index 4fe06412b6b0b..9d79992b48c7e 100644 --- a/lldb/include/lldb/Core/ModuleSpec.h +++ b/lldb/include/lldb/Core/ModuleSpec.h @@ -122,6 +122,8 @@ class ModuleSpec { ConstString &GetObjectName() { return m_object_name; } ConstString GetObjectName() const { return m_object_name; } + + void SetObjectName(ConstString objName) { m_object_name = objName; } uint64_t GetObjectOffset() const { return m_object_offset; } diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index 2b8ccab2406c6..862a2729c1afb 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -260,7 +260,17 @@ void 
ModuleList::ReplaceEquivalent( module_sp->GetArchitecture()); equivalent_module_spec.GetPlatformFileSpec() = module_sp->GetPlatformFileSpec(); - +#ifdef _AIX + // To remove the exact equivalent module, the object name must be + // specified. When the equivalent_module_spec object is created, its + // object name is initially set to NULL. This is because the module_sp's + // GetPath() returns a path in the format (/usr/ccs/libc.a), which does + // not include the object name. As a result, MatchesModuleSpec may return + // true even though the object name is NULL and doesn't match any loaded + // module. To fix this, set the object name of the equivalent_module_spec + // to be the same as the object name of the module_sp. */ + equivalent_module_spec.SetObjectName(module_sp->GetObjectName()); +#endif size_t idx = 0; while (idx < m_modules.size()) { ModuleSP test_module_sp(m_modules[idx]); From 349ec0064668a0ee1d3af7c2f38fa7427b4b2d36 Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Fri, 28 Feb 2025 23:44:40 +0530 Subject: [PATCH 39/58] [AIX][Coredump] AIX Coredump debugging Implementation (#25) Creates a general framework to be able to debug an AIX generated coredump file. At this point, we are only supporting 64-bit core files using this general framework. With this implementation, LLDB can recognise and debug any 64-bit AIX coredump file. Most of the generic debugging commands work after this: # bin/lldb --core /home/dhruv/LLDB/tests/core (lldb) target create --core "/home/dhruv/LLDB/tests/core" Core file '/home/dhruv/LLDB/tests/core' (powerpc64) was loaded. 
(lldb) bt * thread #1, stop reason = SIGSEGV * frame #0: 0x0000000100000940 coretest64`main at core.c:5 frame #1: 0x00000001000004ac coretest64`__start + 116 (lldb) process status Process 18446744071562067991 stopped * thread #1, stop reason = SIGSEGV frame #0: 0x0000000100000940 coretest64`main at core.c:5 2 char *str; 3 int a = 10; 4 str = "GfG"; -> 5 *(str+1) = 'n'; 6 return a; 7 } And others like memory read, image list, image dump sections, disassembly etc --- .../AIX-DYLD/DynamicLoaderAIXDYLD.cpp | 62 ++++- .../AIX-DYLD/DynamicLoaderAIXDYLD.h | 6 + .../Plugins/ObjectFile/AIXCore/CMakeLists.txt | 13 + .../ObjectFile/AIXCore/ObjectFileAIXCore.cpp | 254 ++++++++++++++++++ .../ObjectFile/AIXCore/ObjectFileAIXCore.h | 121 +++++++++ lldb/source/Plugins/ObjectFile/CMakeLists.txt | 1 + lldb/source/Plugins/Process/CMakeLists.txt | 1 + .../Plugins/Process/aix-core/AIXCore.cpp | 116 ++++++++ .../source/Plugins/Process/aix-core/AIXCore.h | 125 +++++++++ .../Plugins/Process/aix-core/CMakeLists.txt | 16 ++ .../Process/aix-core/ProcessAIXCore.cpp | 251 +++++++++++++++++ .../Plugins/Process/aix-core/ProcessAIXCore.h | 100 +++++++ .../aix-core/RegisterContextCoreAIX_ppc64.cpp | 136 ++++++++++ .../aix-core/RegisterContextCoreAIX_ppc64.h | 46 ++++ .../Process/aix-core/ThreadAIXCore.cpp | 127 +++++++++ .../Plugins/Process/aix-core/ThreadAIXCore.h | 110 ++++++++ 16 files changed, 1484 insertions(+), 1 deletion(-) create mode 100644 lldb/source/Plugins/ObjectFile/AIXCore/CMakeLists.txt create mode 100644 lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.cpp create mode 100644 lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.h create mode 100644 lldb/source/Plugins/Process/aix-core/AIXCore.cpp create mode 100644 lldb/source/Plugins/Process/aix-core/AIXCore.h create mode 100644 lldb/source/Plugins/Process/aix-core/CMakeLists.txt create mode 100644 lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp create mode 100644 
lldb/source/Plugins/Process/aix-core/ProcessAIXCore.h create mode 100644 lldb/source/Plugins/Process/aix-core/RegisterContextCoreAIX_ppc64.cpp create mode 100644 lldb/source/Plugins/Process/aix-core/RegisterContextCoreAIX_ppc64.h create mode 100644 lldb/source/Plugins/Process/aix-core/ThreadAIXCore.cpp create mode 100644 lldb/source/Plugins/Process/aix-core/ThreadAIXCore.h diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp index 1a98bb9334043..7e44ffbb1c051 100644 --- a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp @@ -228,6 +228,65 @@ void DynamicLoaderAIXDYLD::ResolveExecutableModule( target.SetExecutableModule(module_sp, eLoadDependentsNo); } +bool DynamicLoaderAIXDYLD::IsCoreFile() const { + return !m_process->IsLiveDebugSession(); +} + +void DynamicLoaderAIXDYLD::FillCoreLoaderData(lldb_private::DataExtractor &data, + uint64_t loader_offset, uint64_t loader_size ) { + + static char *buffer = (char *)malloc(loader_size); + struct ld_info ldinfo[64]; + char *buffer_complete; + struct ld_info *ptr; + int i = 0; + + Log *log = GetLog(LLDBLog::DynamicLoader); + LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); + ByteOrder byteorder = data.GetByteOrder(); + data.ExtractBytes(loader_offset, loader_size, eByteOrderBig, buffer); + buffer_complete = buffer + loader_size; + ldinfo[0].ldinfo_next = 1; + + while (ldinfo[i++].ldinfo_next != 0) { + + ptr = (struct ld_info *)buffer; + ldinfo[i].ldinfo_next = ptr->ldinfo_next; + ldinfo[i].ldinfo_flags = ptr->ldinfo_flags; + ldinfo[i].ldinfo_core = ptr->ldinfo_core; + ldinfo[i].ldinfo_textorg = ptr->ldinfo_textorg; + ldinfo[i].ldinfo_textsize = ptr->ldinfo_textsize; + ldinfo[i].ldinfo_dataorg = ptr->ldinfo_dataorg; + ldinfo[i].ldinfo_datasize = ptr->ldinfo_datasize; + + char *filename = &ptr->ldinfo_filename[0]; + char 
*membername = filename + (strlen(filename) + 1); + strcpy(ldinfo[i].ldinfo_filename, filename); + + buffer += ptr->ldinfo_next; + struct ld_info *ptr2 = &(ldinfo[i]); + char *pathName = ptr2->ldinfo_filename; + char pathWithMember[PATH_MAX] = {0}; + if (strlen(membername) > 0) { + sprintf(pathWithMember, "%s(%s)", pathName, membername); + } else { + sprintf(pathWithMember, "%s", pathName); + } + + FileSpec file(pathWithMember); + ModuleSpec module_spec(file, m_process->GetTarget().GetArchitecture()); + LLDB_LOGF(log, "Module :%s", pathWithMember); + if (ModuleSP module_sp = m_process->GetTarget().GetOrCreateModule(module_spec, true /* notify */)) { + UpdateLoadedSectionsByType(module_sp, LLDB_INVALID_ADDRESS, (lldb::addr_t)ptr2->ldinfo_textorg, false, 1); + UpdateLoadedSectionsByType(module_sp, LLDB_INVALID_ADDRESS, (lldb::addr_t)ptr2->ldinfo_dataorg, false, 2); + // FIXME: .tdata, .bss + } + if (ptr2->ldinfo_next == 0) { + ptr2 = nullptr; + } + } +} + void DynamicLoaderAIXDYLD::DidAttach() { Log *log = GetLog(LLDBLog::DynamicLoader); LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); @@ -361,7 +420,8 @@ void DynamicLoaderAIXDYLD::DidLaunch() { #endif } -Status DynamicLoaderAIXDYLD::CanLoadImage() { return Status(); } +Status DynamicLoaderAIXDYLD::CanLoadImage() { + return Status(); } ThreadPlanSP DynamicLoaderAIXDYLD::GetStepThroughTrampolinePlan(Thread &thread, diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h index 0ffbe688e0069..097f8d048b77f 100644 --- a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h @@ -33,6 +33,9 @@ class DynamicLoaderAIXDYLD : public DynamicLoader { lldb::addr_t module_addr); void OnUnloadModule(lldb::addr_t module_addr); + void FillCoreLoaderData(lldb_private::DataExtractor &data, + uint64_t loader_offset, uint64_t loader_size); + void 
DidAttach() override; void DidLaunch() override; Status CanLoadImage() override; @@ -49,6 +52,9 @@ class DynamicLoaderAIXDYLD : public DynamicLoader { /// Loads Module from inferior process. void ResolveExecutableModule(lldb::ModuleSP &module_sp); + /// Returns true if the process is for a core file. + bool IsCoreFile() const; + private: std::map m_loaded_modules; }; diff --git a/lldb/source/Plugins/ObjectFile/AIXCore/CMakeLists.txt b/lldb/source/Plugins/ObjectFile/AIXCore/CMakeLists.txt new file mode 100644 index 0000000000000..5656b33a61726 --- /dev/null +++ b/lldb/source/Plugins/ObjectFile/AIXCore/CMakeLists.txt @@ -0,0 +1,13 @@ +add_lldb_library(lldbPluginObjectFileAIXCore PLUGIN + ObjectFileAIXCore.cpp + + LINK_LIBS + lldbCore + lldbHost + lldbSymbol + lldbTarget + lldbUtility + lldbPluginProcessUtility + LINK_COMPONENTS + Support + ) diff --git a/lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.cpp b/lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.cpp new file mode 100644 index 0000000000000..5158fa4e25077 --- /dev/null +++ b/lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.cpp @@ -0,0 +1,254 @@ +//===-- ObjectFileAIXCore.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ObjectFileAIXCore.h" + +#include +#include +#include +#include + +#include "lldb/Utility/FileSpecList.h" +#include "lldb/Core/Module.h" +#include "lldb/Core/ModuleSpec.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Symbol/SymbolContext.h" +#include "lldb/Target/Process.h" +#include "lldb/Target/Target.h" +#include "lldb/Utility/ArchSpec.h" +#include "lldb/Utility/DataBufferHeap.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/Stream.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/XCOFF.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Object/XCOFFObjectFile.h" + +using namespace llvm; +using namespace lldb; +using namespace lldb_private; + +LLDB_PLUGIN_DEFINE(ObjectFileAIXCore) + +enum CoreVersion : uint64_t {AIXCORE32 = 0xFEEDDB1, AIXCORE64 = 0xFEEDDB2}; + +bool m_is_core = false; + +// Static methods. 
+void ObjectFileAIXCore::Initialize() { + PluginManager::RegisterPlugin(GetPluginNameStatic(), + GetPluginDescriptionStatic(), CreateInstance, + CreateMemoryInstance, GetModuleSpecifications); +} + +void ObjectFileAIXCore::Terminate() { + PluginManager::UnregisterPlugin(CreateInstance); +} + +ObjectFile *ObjectFileAIXCore::CreateInstance(const lldb::ModuleSP &module_sp, + DataBufferSP data_sp, + lldb::offset_t data_offset, + const lldb_private::FileSpec *file, + lldb::offset_t file_offset, + lldb::offset_t length) { + + if(m_is_core) + { + + bool mapped_writable = false; + if (!data_sp) { + data_sp = MapFileDataWritable(*file, length, file_offset); + if (!data_sp) + return nullptr; + data_offset = 0; + mapped_writable = true; + } + + assert(data_sp); + + const uint8_t *magic = data_sp->GetBytes() + data_offset; + + // Update the data to contain the entire file if it doesn't already + if (data_sp->GetByteSize() < length) { + data_sp = MapFileDataWritable(*file, length, file_offset); + if (!data_sp) + return nullptr; + data_offset = 0; + mapped_writable = true; + magic = data_sp->GetBytes(); + } + + // If we didn't map the data as writable take ownership of the buffer. 
+ if (!mapped_writable) { + data_sp = std::make_shared(data_sp->GetBytes(), + data_sp->GetByteSize()); + data_offset = 0; + magic = data_sp->GetBytes(); + } + + std::unique_ptr objfile_up(new ObjectFileAIXCore( + module_sp, data_sp, data_offset, file, file_offset, length)); + ArchSpec spec = objfile_up->GetArchitecture(); + objfile_up->SetModulesArchitecture(spec); + return objfile_up.release(); + + } +} + +ObjectFile *ObjectFileAIXCore::CreateMemoryInstance( + const lldb::ModuleSP &module_sp, WritableDataBufferSP data_sp, + const lldb::ProcessSP &process_sp, lldb::addr_t header_addr) { + return nullptr; +} + +size_t ObjectFileAIXCore::GetModuleSpecifications( + const lldb_private::FileSpec &file, lldb::DataBufferSP &data_sp, + lldb::offset_t data_offset, lldb::offset_t file_offset, + lldb::offset_t length, lldb_private::ModuleSpecList &specs) { + const size_t initial_count = specs.GetSize(); + + if (ObjectFileAIXCore::MagicBytesMatch(data_sp, 0, data_sp->GetByteSize())) { + // Need new ArchType??? 
+ ArchSpec arch_spec = ArchSpec(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE); + ModuleSpec spec(file, arch_spec); + spec.GetArchitecture().SetArchitecture(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE, llvm::Triple::AIX); + specs.Append(spec); + } + return specs.GetSize() - initial_count; +} + +static uint32_t AIXCoreHeaderCheckFromMagic(uint32_t magic) { + + Log *log = GetLog(LLDBLog::Modules); + switch (magic) { + case AIXCORE32: + LLDB_LOGF(log, "ObjectFileAIXCore: 32-bit not supported"); + break; + case AIXCORE64: + m_is_core = true; + return 1; + break; + } + return 0; +} + +bool ObjectFileAIXCore::MagicBytesMatch(DataBufferSP &data_sp, + lldb::addr_t data_offset, + lldb::addr_t data_length) { + lldb_private::DataExtractor data; + data.SetData(data_sp, data_offset, data_length); + lldb::offset_t offset = 0; + offset += 4; // Skipping to the coredump version + uint32_t magic = data.GetU32(&offset); + return AIXCoreHeaderCheckFromMagic(magic) != 0; +} + +bool ObjectFileAIXCore::ParseHeader() { + + return false; +} + +ByteOrder ObjectFileAIXCore::GetByteOrder() const { + return eByteOrderBig; +} + +bool ObjectFileAIXCore::IsExecutable() const { + return false; +} + +uint32_t ObjectFileAIXCore::GetAddressByteSize() const { + return 8; +} + +AddressClass ObjectFileAIXCore::GetAddressClass(addr_t file_addr) { + return AddressClass::eUnknown; +} + +lldb::SymbolType ObjectFileAIXCore::MapSymbolType(llvm::object::SymbolRef::Type sym_type) { + if (sym_type == llvm::object::SymbolRef::ST_Function) + return lldb::eSymbolTypeCode; + else if (sym_type == llvm::object::SymbolRef::ST_Data) + return lldb::eSymbolTypeData; + return lldb::eSymbolTypeInvalid; +} + +void ObjectFileAIXCore::ParseSymtab(Symtab &lldb_symtab) { +} + +bool ObjectFileAIXCore::IsStripped() { + return false; +} + +void ObjectFileAIXCore::CreateSections(SectionList &unified_section_list) { +} + +void ObjectFileAIXCore::Dump(Stream *s) { +} + +ArchSpec 
ObjectFileAIXCore::GetArchitecture() { + ArchSpec arch_spec = ArchSpec(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE); + return arch_spec; +} + +UUID ObjectFileAIXCore::GetUUID() { + return UUID(); +} + +uint32_t ObjectFileAIXCore::GetDependentModules(FileSpecList &files) { + + auto original_size = files.GetSize(); + return files.GetSize() - original_size; +} + +Address ObjectFileAIXCore::GetImageInfoAddress(Target *target) { + return Address(); +} + +lldb_private::Address ObjectFileAIXCore::GetBaseAddress() { + return lldb_private::Address(); +} +ObjectFile::Type ObjectFileAIXCore::CalculateType() { + return eTypeCoreFile; +} + +ObjectFile::Strata ObjectFileAIXCore::CalculateStrata() { + return eStrataUnknown; +} + +std::vector +ObjectFileAIXCore::GetLoadableData(Target &target) { + std::vector loadables; + return loadables; +} + +lldb::WritableDataBufferSP +ObjectFileAIXCore::MapFileDataWritable(const FileSpec &file, uint64_t Size, + uint64_t Offset) { + return FileSystem::Instance().CreateWritableDataBuffer(file.GetPath(), Size, + Offset); +} + +ObjectFileAIXCore::ObjectFileAIXCore(const lldb::ModuleSP &module_sp, + DataBufferSP data_sp, lldb::offset_t data_offset, + const FileSpec *file, lldb::offset_t file_offset, + lldb::offset_t length) + : ObjectFile(module_sp, file, file_offset, length, data_sp, data_offset) + { + if (file) + m_file = *file; +} + +ObjectFileAIXCore::ObjectFileAIXCore(const lldb::ModuleSP &module_sp, + DataBufferSP header_data_sp, + const lldb::ProcessSP &process_sp, + addr_t header_addr) + : ObjectFile(module_sp, process_sp, header_addr, header_data_sp) + { +} diff --git a/lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.h b/lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.h new file mode 100644 index 0000000000000..5dbd78d919bb6 --- /dev/null +++ b/lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.h @@ -0,0 +1,121 @@ +//===-- ObjectFileAIXCore.h --------------------------------------- -*- C++ -*-===// +// 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_OBJECTFILE_AIXCORE_OBJECTFILEAIXCORE_H +#define LLDB_SOURCE_PLUGINS_OBJECTFILE_AIXCORE_OBJECTFILEAIXCORE_H + +#include + +#include + +#include "lldb/Symbol/ObjectFile.h" +#include "lldb/Utility/ArchSpec.h" +#include "lldb/Utility/FileSpec.h" +#include "lldb/Utility/UUID.h" +#include "lldb/lldb-private.h" +#include "llvm/Object/XCOFFObjectFile.h" + +/// \class ObjectFileAIXCore +/// Generic AIX CORE object file reader. +/// +/// This class provides a generic AIX Core (32/64 bit) reader plugin implementing +/// the ObjectFile protocol. +class ObjectFileAIXCore : public lldb_private::ObjectFile { +public: + // Static Functions + static void Initialize(); + + static void Terminate(); + + static llvm::StringRef GetPluginNameStatic() { return "aix-core-obj"; } + + static llvm::StringRef GetPluginDescriptionStatic() { + return "AIX core object file reader."; + } + + static lldb_private::ObjectFile * + CreateInstance(const lldb::ModuleSP &module_sp, lldb::DataBufferSP data_sp, + lldb::offset_t data_offset, const lldb_private::FileSpec *file, + lldb::offset_t file_offset, lldb::offset_t length); + + static lldb_private::ObjectFile *CreateMemoryInstance( + const lldb::ModuleSP &module_sp, lldb::WritableDataBufferSP data_sp, + const lldb::ProcessSP &process_sp, lldb::addr_t header_addr); + + static size_t GetModuleSpecifications(const lldb_private::FileSpec &file, + lldb::DataBufferSP &data_sp, + lldb::offset_t data_offset, + lldb::offset_t file_offset, + lldb::offset_t length, + lldb_private::ModuleSpecList &specs); + + static bool MagicBytesMatch(lldb::DataBufferSP &data_sp, lldb::addr_t offset, + lldb::addr_t length); + + static lldb::SymbolType 
MapSymbolType(llvm::object::SymbolRef::Type sym_type); + + // PluginInterface protocol + llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } + + // ObjectFile Protocol. + bool ParseHeader() override; + + lldb::ByteOrder GetByteOrder() const override; + + bool IsExecutable() const override; + + uint32_t GetAddressByteSize() const override; + + lldb_private::AddressClass GetAddressClass(lldb::addr_t file_addr) override; + + void ParseSymtab(lldb_private::Symtab &symtab) override; + + bool IsStripped() override; + + void CreateSections(lldb_private::SectionList &unified_section_list) override; + + void Dump(lldb_private::Stream *s) override; + + lldb_private::ArchSpec GetArchitecture() override; + + lldb_private::UUID GetUUID() override; + + uint32_t GetDependentModules(lldb_private::FileSpecList &files) override; + + lldb_private::Address + GetImageInfoAddress(lldb_private::Target *target) override; + lldb_private::Address GetBaseAddress() override; + + ObjectFile::Type CalculateType() override; + + ObjectFile::Strata CalculateStrata() override; + + ObjectFileAIXCore(const lldb::ModuleSP &module_sp, lldb::DataBufferSP data_sp, + lldb::offset_t data_offset, const lldb_private::FileSpec *file, + lldb::offset_t offset, lldb::offset_t length); + + ObjectFileAIXCore(const lldb::ModuleSP &module_sp, + lldb::DataBufferSP header_data_sp, + const lldb::ProcessSP &process_sp, lldb::addr_t header_addr); + +protected: + + static bool ParseAIXCoreHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset_ptr + ); + + std::vector + GetLoadableData(lldb_private::Target &target) override; + + static lldb::WritableDataBufferSP + MapFileDataWritable(const lldb_private::FileSpec &file, uint64_t Size, + uint64_t Offset); + +}; + +#endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_AIXCORE_OBJECTFILEAIXCORE_H diff --git a/lldb/source/Plugins/ObjectFile/CMakeLists.txt b/lldb/source/Plugins/ObjectFile/CMakeLists.txt index 7abd0c96f4fd7..1605356fdb7f1 100644 --- 
a/lldb/source/Plugins/ObjectFile/CMakeLists.txt +++ b/lldb/source/Plugins/ObjectFile/CMakeLists.txt @@ -9,3 +9,4 @@ add_subdirectory(PECOFF) add_subdirectory(XCOFF) add_subdirectory(Placeholder) add_subdirectory(wasm) +add_subdirectory(AIXCore) diff --git a/lldb/source/Plugins/Process/CMakeLists.txt b/lldb/source/Plugins/Process/CMakeLists.txt index 058b4b9ad2157..0b66ea18c82ce 100644 --- a/lldb/source/Plugins/Process/CMakeLists.txt +++ b/lldb/source/Plugins/Process/CMakeLists.txt @@ -24,3 +24,4 @@ add_subdirectory(elf-core) add_subdirectory(mach-core) add_subdirectory(minidump) add_subdirectory(FreeBSDKernel) +add_subdirectory(aix-core) diff --git a/lldb/source/Plugins/Process/aix-core/AIXCore.cpp b/lldb/source/Plugins/Process/aix-core/AIXCore.cpp new file mode 100644 index 0000000000000..95e47b4d8be53 --- /dev/null +++ b/lldb/source/Plugins/Process/aix-core/AIXCore.cpp @@ -0,0 +1,116 @@ +//===-- AIXCore.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "lldb/Core/Section.h" +#include "lldb/Utility/DataExtractor.h" +#include "lldb/Utility/Stream.h" +#include "lldb/Utility/DataBufferHeap.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/State.h" + +#include "AIXCore.h" + +using namespace AIXCORE; +using namespace lldb; +using namespace lldb_private; + +AIXCore64Header::AIXCore64Header() { memset(this, 0, sizeof(AIXCore64Header)); } + + +bool AIXCore64Header::ParseRegisterContext(lldb_private::DataExtractor &data, + lldb::offset_t *offset) { + // The data is arranged in this order in this coredump file + // so we have to fetch in this exact order. 
But need to change + // the context structure order according to Infos_ppc64 + for(int i = 0; i < 32; i++) + Fault.context.gpr[i] = data.GetU64(offset); + Fault.context.msr = data.GetU64(offset); + Fault.context.pc = data.GetU64(offset); + Fault.context.lr = data.GetU64(offset); + Fault.context.ctr = data.GetU64(offset); + Fault.context.cr = data.GetU32(offset); + Fault.context.xer = data.GetU32(offset); + Fault.context.fpscr = data.GetU32(offset); + Fault.context.fpscrx = data.GetU32(offset); + Fault.context.except[0] = data.GetU64(offset); + for(int i = 0; i < 32; i++) + Fault.context.fpr[i] = data.GetU64(offset); + Fault.context.fpeu = data.GetU8(offset); + Fault.context.fpinfo = data.GetU8(offset); + Fault.context.fpscr24_31 = data.GetU8(offset); + Fault.context.pad[0] = data.GetU8(offset); + Fault.context.excp_type = data.GetU32(offset); + + return true; +} +bool AIXCore64Header::ParseThreadContext(lldb_private::DataExtractor &data, + lldb::offset_t *offset) { + + lldb::offset_t offset_to_regctx = *offset; + offset_to_regctx += sizeof(thrdentry64); + Fault.thread.ti_tid = data.GetU64(offset); + Fault.thread.ti_pid = data.GetU32(offset); + int ret = ParseRegisterContext(data, &offset_to_regctx); + return true; +} + +bool AIXCore64Header::ParseUserData(lldb_private::DataExtractor &data, + lldb::offset_t *offset) { + User.process.pi_pid = data.GetU32(offset); + User.process.pi_ppid = data.GetU32(offset); + User.process.pi_sid = data.GetU32(offset); + User.process.pi_pgrp = data.GetU32(offset); + User.process.pi_uid = data.GetU32(offset); + User.process.pi_suid = data.GetU32(offset); + + *offset += 76; + + ByteOrder byteorder = data.GetByteOrder(); + size_t size = 33; + data.ExtractBytes(*offset, size, byteorder, User.process.pi_comm); + offset += size; + + return true; +} + +bool AIXCore64Header::ParseCoreHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset) { + + SignalNum = data.GetU8(offset); + Flag = data.GetU8(offset); + Entries = 
data.GetU16(offset); + Version = data.GetU32(offset); + FDInfo = data.GetU64(offset); + + LoaderOffset = data.GetU64(offset); + LoaderSize = data.GetU64(offset); + NumberOfThreads = data.GetU32(offset); + Reserved0 = data.GetU32(offset); + ThreadContextOffset = data.GetU64(offset); + NumSegRegion = data.GetU64(offset); + SegRegionOffset = data.GetU64(offset); + StackOffset = data.GetU64(offset); + StackBaseAddr = data.GetU64(offset); + StackSize = data.GetU64(offset); + DataRegionOffset = data.GetU64(offset); + DataBaseAddr = data.GetU64(offset); + DataSize = data.GetU64(offset); + + *offset += 104; + lldb::offset_t offset_to_user = (*offset + sizeof(ThreadContext64)); + int ret = 0; + ret = ParseThreadContext(data, offset); + ret = ParseUserData(data, &offset_to_user); + + return true; + +} + diff --git a/lldb/source/Plugins/Process/aix-core/AIXCore.h b/lldb/source/Plugins/Process/aix-core/AIXCore.h new file mode 100644 index 0000000000000..3d78d5e92c7ab --- /dev/null +++ b/lldb/source/Plugins/Process/aix-core/AIXCore.h @@ -0,0 +1,125 @@ +//===-- AIXCore.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Notes about AIX Process core dumps: +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_AIXCORE_H +#define LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_AIXCORE_H + +#include "llvm/ADT/StringRef.h" +#include +#include +#include + +#include +#include + +namespace AIXCORE { + +struct RegContext { + // The data is arranged in order as filled by AIXCore.cpp in this coredump file + // so we have to fetch in that exact order, refer there. 
+ // But need to change + // the context structure in order according to Infos_ppc64 + uint64_t gpr[32]; /* 64-bit gprs */ + unsigned long pc; /* msr */ + unsigned long msr; /* iar */ + unsigned long origr3; /* iar */ + unsigned long ctr; /* CTR */ + unsigned long lr; /* LR */ + unsigned long xer; /* XER */ + unsigned long cr; /* CR */ + unsigned long softe; /* CR */ + unsigned long trap; /* CR */ + unsigned int fpscr; /* floating pt status reg */ + unsigned int fpscrx; /* software ext to fpscr */ + unsigned long except[1]; /* exception address */ + double fpr[32]; /* floating pt regs */ + char fpeu; /* floating pt ever used */ + char fpinfo; /* floating pt info */ + char fpscr24_31; /* bits 24-31 of 64-bit FPSCR */ + char pad[1]; + int excp_type; /* exception type */ +}; + + struct ThreadContext64 { + struct thrdentry64 thread; + struct RegContext context; + }; + + struct UserData { + + struct procentry64 process; + unsigned long long reserved[16]; + }; + + struct AIXCore64Header { + + int8_t SignalNum; /* signal number (cause of error) */ + int8_t Flag; /* flag to describe core dump type */ + uint16_t Entries; /* number of core dump modules */ + uint32_t Version; /* core file format number */ + uint64_t FDInfo; /* offset to fd region in file */ + + uint64_t LoaderOffset; /* offset to loader region in file */ + uint64_t LoaderSize; /* size of loader region */ + + uint32_t NumberOfThreads ; /* number of elements in thread table */ + uint32_t Reserved0; /* Padding */ + uint64_t ThreadContextOffset; /* offset to thread context table */ + + uint64_t NumSegRegion; /* n of elements in segregion */ + uint64_t SegRegionOffset; /* offset to start of segregion table */ + + uint64_t StackOffset; /* offset of user stack in file */ + uint64_t StackBaseAddr; /* base address of user stack region */ + uint64_t StackSize; /* size of user stack region */ + + uint64_t DataRegionOffset; /* offset to user data region */ + uint64_t DataBaseAddr; /* base address of user data region */ + 
uint64_t DataSize; /* size of user data region */ + uint64_t SDataBase; /* base address of sdata region */ + uint64_t SDataSize; /* size of sdata region */ + + uint64_t NumVMRegions; /* number of anonymously mapped areas */ + uint64_t VMOffset; /* offset to start of vm_infox table */ + + int32_t ProcessorImplementation; /* processor implementation */ + uint32_t NumElementsCTX; /* n of elements in extended ctx table*/ + uint64_t CPRSOffset; /* Checkpoint/Restart offset */ + uint64_t ExtendedContextOffset; /* extended context offset */ + uint64_t OffsetUserKey; /* Offset to user-key exception data */ + uint64_t OffsetLoaderTLS; /* offset to the loader region in file + when a process uses TLS data */ + uint64_t TLSLoaderSize; /* size of the above loader region */ + uint64_t ExtendedProcEntry; /* Extended procentry64 information */ + uint64_t Reserved[2]; + + struct ThreadContext64 Fault; + + struct UserData User; + + AIXCore64Header(); + + bool ParseCoreHeader(lldb_private::DataExtractor &data, + lldb::offset_t *offset); + bool ParseThreadContext(lldb_private::DataExtractor &data, + lldb::offset_t *offset); + bool ParseUserData(lldb_private::DataExtractor &data, + lldb::offset_t *offset); + bool ParseRegisterContext(lldb_private::DataExtractor &data, + lldb::offset_t *offset); + bool ParseLoaderData(lldb_private::DataExtractor &data, + lldb::offset_t *offset); + + }; + + +} + +#endif // LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_AIXCORE_H diff --git a/lldb/source/Plugins/Process/aix-core/CMakeLists.txt b/lldb/source/Plugins/Process/aix-core/CMakeLists.txt new file mode 100644 index 0000000000000..347717a362491 --- /dev/null +++ b/lldb/source/Plugins/Process/aix-core/CMakeLists.txt @@ -0,0 +1,16 @@ +add_definitions("-D_ALL_SOURCE") + +add_lldb_library(lldbPluginProcessAIXCore PLUGIN + ProcessAIXCore.cpp + AIXCore.cpp + ThreadAIXCore.cpp + RegisterContextCoreAIX_ppc64.cpp + + LINK_LIBS + lldbCore + lldbTarget + lldbPluginProcessUtility + LINK_COMPONENTS + BinaryFormat + 
Support + ) diff --git a/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp b/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp new file mode 100644 index 0000000000000..9300aa14ac4db --- /dev/null +++ b/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp @@ -0,0 +1,251 @@ +//===-- ProcessAIXCore.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +#include "lldb/Core/Module.h" +#include "lldb/Core/ModuleSpec.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Core/Section.h" +#include "lldb/Target/ABI.h" +#include "lldb/Target/DynamicLoader.h" +#include "lldb/Target/MemoryRegionInfo.h" +#include "lldb/Target/Target.h" +#include "lldb/Target/UnixSignals.h" +#include "lldb/Utility/DataBufferHeap.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/State.h" + +#include "llvm/Support/Threading.h" +#include "Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.h" + +#include "ProcessAIXCore.h" +#include "AIXCore.h" +#include "ThreadAIXCore.h" + +using namespace lldb_private; + +LLDB_PLUGIN_DEFINE(ProcessAIXCore) + +llvm::StringRef ProcessAIXCore::GetPluginDescriptionStatic() { + return "AIX core dump plug-in."; +} + +void ProcessAIXCore::Initialize() { + static llvm::once_flag g_once_flag; + + llvm::call_once(g_once_flag, []() { + PluginManager::RegisterPlugin(GetPluginNameStatic(), + GetPluginDescriptionStatic(), CreateInstance); + }); +} + +void ProcessAIXCore::Terminate() { + PluginManager::UnregisterPlugin(ProcessAIXCore::CreateInstance); +} + +lldb::ProcessSP ProcessAIXCore::CreateInstance(lldb::TargetSP target_sp, + lldb::ListenerSP listener_sp, + const FileSpec *crash_file, 
+ bool can_connect) { + lldb::ProcessSP process_sp; + if (crash_file && !can_connect) { + const size_t header_size = sizeof(AIXCORE::AIXCore64Header); + + auto data_sp = FileSystem::Instance().CreateDataBuffer( + crash_file->GetPath(), header_size, 0); + + if (data_sp && data_sp->GetByteSize() == header_size) { + AIXCORE::AIXCore64Header aixcore_header; + DataExtractor data(data_sp, lldb::eByteOrderBig, 4); + lldb::offset_t data_offset = 0; + if(aixcore_header.ParseCoreHeader(data, &data_offset)) { + process_sp = std::make_shared(target_sp, listener_sp, + *crash_file); + } + } + + } + return process_sp; +} + +// ProcessAIXCore constructor +ProcessAIXCore::ProcessAIXCore(lldb::TargetSP target_sp, + lldb::ListenerSP listener_sp, + const FileSpec &core_file) + : PostMortemProcess(target_sp, listener_sp, core_file) {} + +// Destructor +ProcessAIXCore::~ProcessAIXCore() { + Clear(); + // We need to call finalize on the process before destroying ourselves to + // make sure all of the broadcaster cleanup goes as planned. If we destruct + // this class, then Process::~Process() might have problems trying to fully + // destroy the broadcaster. 
+ Finalize(true /* destructing */); +} + +bool ProcessAIXCore::CanDebug(lldb::TargetSP target_sp, + bool plugin_specified_by_name) { + + if (!m_core_module_sp && FileSystem::Instance().Exists(m_core_file)) { + ModuleSpec core_module_spec(m_core_file, target_sp->GetArchitecture()); + Status error(ModuleList::GetSharedModule(core_module_spec, m_core_module_sp, + nullptr, nullptr, nullptr)); + if (m_core_module_sp) { + ObjectFile *core_objfile = m_core_module_sp->GetObjectFile(); + if (core_objfile && core_objfile->GetType() == ObjectFile::eTypeCoreFile){ + return true; + } + } + } + return false; + +} + +ArchSpec ProcessAIXCore::GetArchitecture() { + + ArchSpec arch = m_core_module_sp->GetObjectFile()->GetArchitecture(); + + ArchSpec target_arch = GetTarget().GetArchitecture(); + arch.MergeFrom(target_arch); + + return arch; +} + +lldb_private::DynamicLoader *ProcessAIXCore::GetDynamicLoader() { + if (m_dyld_up.get() == nullptr) { + m_dyld_up.reset(DynamicLoader::FindPlugin( + this, DynamicLoaderAIXDYLD::GetPluginNameStatic())); + } + return m_dyld_up.get(); +} + +void ProcessAIXCore::ParseAIXCoreFile() { + + Log *log = GetLog(LLDBLog::Process); + AIXSigInfo siginfo; + ThreadData thread_data; + + const lldb_private::UnixSignals &unix_signals = *GetUnixSignals(); + const ArchSpec &arch = GetArchitecture(); + + siginfo.Parse(m_aixcore_header, arch, unix_signals); + thread_data.siginfo = siginfo; + SetID(m_aixcore_header.User.process.pi_pid); + + thread_data.name.assign (m_aixcore_header.User.process.pi_comm, + strnlen (m_aixcore_header.User.process.pi_comm, + sizeof (m_aixcore_header.User.process.pi_comm))); + + lldb::DataBufferSP data_buffer_sp(new lldb_private::DataBufferHeap(sizeof(m_aixcore_header.Fault.context), 0)); + + memcpy(static_cast(const_cast(data_buffer_sp->GetBytes())), + &m_aixcore_header.Fault.context, sizeof(m_aixcore_header.Fault.context)); + + lldb_private::DataExtractor data(data_buffer_sp, lldb::eByteOrderBig, 8); + + thread_data.gpregset = 
DataExtractor(data, 0, sizeof(m_aixcore_header.Fault.context)); + m_thread_data.push_back(thread_data); + LLDB_LOGF(log, "ProcessAIXCore: Parsing Complete!"); + +} + +// Process Control +Status ProcessAIXCore::DoLoadCore() { + + Status error; + if (!m_core_module_sp) { + error = Status::FromErrorString("invalid core module"); + return error; + } + + FileSpec file = m_core_module_sp->GetObjectFile()->GetFileSpec(); + + if (file) { + const size_t header_size = sizeof(AIXCORE::AIXCore64Header); + auto data_sp = FileSystem::Instance().CreateDataBuffer( + file.GetPath(), -1, 0); + if (data_sp && data_sp->GetByteSize() != 0) { + + DataExtractor data(data_sp, lldb::eByteOrderBig, 4); + lldb::offset_t data_offset = 0; + m_aixcore_header.ParseCoreHeader(data, &data_offset); + auto dyld = static_cast(GetDynamicLoader()); + dyld->FillCoreLoaderData(data, m_aixcore_header.LoaderOffset, + m_aixcore_header.LoaderSize); + + } else { + error = Status::FromErrorString("invalid data"); + return error; + } + } else { + error = Status::FromErrorString("invalid file"); + return error; + } + + m_thread_data_valid = true; + ParseAIXCoreFile(); + ArchSpec arch(m_core_module_sp->GetArchitecture()); + + ArchSpec target_arch = GetTarget().GetArchitecture(); + ArchSpec core_arch(m_core_module_sp->GetArchitecture()); + target_arch.MergeFrom(core_arch); + GetTarget().SetArchitecture(target_arch); + + lldb::ModuleSP exe_module_sp = GetTarget().GetExecutableModule(); + if (!exe_module_sp) { + ModuleSpec exe_module_spec; + exe_module_spec.GetArchitecture() = arch; + exe_module_spec.GetFileSpec().SetFile(m_aixcore_header.User.process.pi_comm, + FileSpec::Style::native); + exe_module_sp = GetTarget().GetOrCreateModule(exe_module_spec, true); + GetTarget().SetExecutableModule(exe_module_sp, eLoadDependentsNo); + } + + return error; +} + +bool ProcessAIXCore::DoUpdateThreadList(ThreadList &old_thread_list, + ThreadList &new_thread_list) +{ + const ThreadData &td = m_thread_data[0]; + + lldb::ThreadSP 
thread_sp = + std::make_shared(*this, td); + new_thread_list.AddThread(thread_sp); + + return true; +} + +void ProcessAIXCore::RefreshStateAfterStop() {} + +// Process Memory +size_t ProcessAIXCore::ReadMemory(lldb::addr_t addr, void *buf, size_t size, + Status &error) { + if (lldb::ABISP abi_sp = GetABI()) + addr = abi_sp->FixAnyAddress(addr); + + // Don't allow the caching that lldb_private::Process::ReadMemory does since + // in core files we have it all cached our our core file anyway. + return DoReadMemory(addr, buf, size, error); +} + +size_t ProcessAIXCore::DoReadMemory(lldb::addr_t addr, void *buf, size_t size, + Status &error) { return 0; } + +Status ProcessAIXCore::DoGetMemoryRegionInfo(lldb::addr_t load_addr, + MemoryRegionInfo ®ion_info) { + return Status(); +} + +Status ProcessAIXCore::DoDestroy() { return Status(); } diff --git a/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.h b/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.h new file mode 100644 index 0000000000000..9880c491689ca --- /dev/null +++ b/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.h @@ -0,0 +1,100 @@ +//===-- ProcessAIXCore.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Notes about AIX Process core dumps: +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_PROCESSAIXCORE_H +#define LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_PROCESSAIXCORE_H + +#include +#include + +#include "lldb/Target/PostMortemProcess.h" +#include "lldb/Utility/Status.h" +#include "lldb/Target/Process.h" +#include "AIXCore.h" +#include "ThreadAIXCore.h" + +struct ThreadData; + +class ProcessAIXCore : public lldb_private::PostMortemProcess { +public: + // Constructors and Destructors + static lldb::ProcessSP + CreateInstance(lldb::TargetSP target_sp, lldb::ListenerSP listener_sp, + const lldb_private::FileSpec *crash_file_path, + bool can_connect); + + static void Initialize(); + + static void Terminate(); + + static llvm::StringRef GetPluginNameStatic() { return "aix-core"; } + + static llvm::StringRef GetPluginDescriptionStatic(); + + // Constructors and Destructors + ProcessAIXCore(lldb::TargetSP target_sp, lldb::ListenerSP listener_sp, + const lldb_private::FileSpec &core_file); + + ~ProcessAIXCore() override; + + // PluginInterface protocol + llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } + + // Process Control + lldb_private::Status DoDestroy() override; + + lldb_private::Status WillResume() override { + return lldb_private::Status::FromErrorStringWithFormatv( + "error: {0} does not support resuming processes", GetPluginName()); + } + + bool WarnBeforeDetach() const override { return false; } + + lldb_private::ArchSpec GetArchitecture(); + + bool CanDebug(lldb::TargetSP target_sp, + bool plugin_specified_by_name) override; + + // Creating a new process, or attaching to an existing one + lldb_private::Status DoLoadCore() override; + + bool DoUpdateThreadList(lldb_private::ThreadList &old_thread_list, + lldb_private::ThreadList &new_thread_list) override; + + lldb_private::Status + 
DoGetMemoryRegionInfo(lldb::addr_t load_addr, + lldb_private::MemoryRegionInfo ®ion_info) override; + + void RefreshStateAfterStop() override; + + lldb_private::DynamicLoader *GetDynamicLoader() override; + + // Process Memory + size_t ReadMemory(lldb::addr_t addr, void *buf, size_t size, + lldb_private::Status &error) override; + + size_t DoReadMemory(lldb::addr_t addr, void *buf, size_t size, + lldb_private::Status &error) override; + + void ParseAIXCoreFile(); + + +private: + lldb::ModuleSP m_core_module_sp; + std::string m_dyld_plugin_name; + + // True if m_thread_contexts contains valid entries + bool m_thread_data_valid = false; + AIXCORE::AIXCore64Header m_aixcore_header; + + std::vector m_thread_data; +}; + +#endif // LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_PROCESSAIXCORE_H diff --git a/lldb/source/Plugins/Process/aix-core/RegisterContextCoreAIX_ppc64.cpp b/lldb/source/Plugins/Process/aix-core/RegisterContextCoreAIX_ppc64.cpp new file mode 100644 index 0000000000000..b243017bf9a2a --- /dev/null +++ b/lldb/source/Plugins/Process/aix-core/RegisterContextCoreAIX_ppc64.cpp @@ -0,0 +1,136 @@ +//===-- RegisterContextCoreAIX_ppc64.cpp ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RegisterContextCoreAIX_ppc64.h" + +#include "lldb/Target/Thread.h" +#include "lldb/Utility/DataBufferHeap.h" +#include "lldb/Utility/RegisterValue.h" + +#include "Plugins/Process/Utility/lldb-ppc64le-register-enums.h" +#include "Plugins/Process/elf-core/RegisterUtilities.h" + +#include + +using namespace lldb_private; + +RegisterContextCoreAIX_ppc64::RegisterContextCoreAIX_ppc64( + Thread &thread, RegisterInfoInterface *register_info, + const DataExtractor &gpregset) + : RegisterContextPOSIX_ppc64le(thread, 0, register_info) { + m_gpr_buffer = std::make_shared(gpregset.GetDataStart(), + gpregset.GetByteSize()); + m_gpr.SetData(m_gpr_buffer); + m_gpr.SetByteOrder(gpregset.GetByteOrder()); + + // This Code is for Registers like FPR, VSR, VMX and is disabled right now. + // It will be implemented as per need. + + /* ArchSpec arch = register_info->GetTargetArchitecture(); + DataExtractor fpregset;// = getRegset(notes, arch.GetTriple(), FPR_Desc); + m_fpr_buffer = std::make_shared(fpregset.GetDataStart(), + fpregset.GetByteSize()); + m_fpr.SetData(m_fpr_buffer); + m_fpr.SetByteOrder(fpregset.GetByteOrder()); + + DataExtractor vmxregset;// = getRegset(notes, arch.GetTriple(), PPC_VMX_Desc); + m_vmx_buffer = std::make_shared(vmxregset.GetDataStart(), + vmxregset.GetByteSize()); + m_vmx.SetData(m_vmx_buffer); + m_vmx.SetByteOrder(vmxregset.GetByteOrder()); + + DataExtractor vsxregset;// = getRegset(notes, arch.GetTriple(), PPC_VSX_Desc); + m_vsx_buffer = std::make_shared(vsxregset.GetDataStart(), + vsxregset.GetByteSize()); + m_vsx.SetData(m_vsx_buffer); + m_vsx.SetByteOrder(vsxregset.GetByteOrder());*/ +} + +size_t RegisterContextCoreAIX_ppc64::GetFPRSize() const { + return k_num_fpr_registers_ppc64le * sizeof(uint64_t); +} + +size_t RegisterContextCoreAIX_ppc64::GetVMXSize() const { + return 
(k_num_vmx_registers_ppc64le - 1) * sizeof(uint64_t) * 2 + + sizeof(uint32_t); +} + +size_t RegisterContextCoreAIX_ppc64::GetVSXSize() const { + return k_num_vsx_registers_ppc64le * sizeof(uint64_t) * 2; +} + +bool RegisterContextCoreAIX_ppc64::ReadRegister( + const RegisterInfo *reg_info, RegisterValue &value) { + lldb::offset_t offset = reg_info->byte_offset; + + if (IsFPR(reg_info->kinds[lldb::eRegisterKindLLDB])) { + uint64_t v; + offset -= GetGPRSize(); + offset = m_fpr.CopyData(offset, reg_info->byte_size, &v); + + if (offset == reg_info->byte_size) { + value.SetBytes(&v, reg_info->byte_size, m_fpr.GetByteOrder()); + return true; + } + } else if (IsVMX(reg_info->kinds[lldb::eRegisterKindLLDB])) { + uint32_t v[4]; + offset -= GetGPRSize() + GetFPRSize(); + offset = m_vmx.CopyData(offset, reg_info->byte_size, &v); + + if (offset == reg_info->byte_size) { + value.SetBytes(v, reg_info->byte_size, m_vmx.GetByteOrder()); + return true; + } + } else if (IsVSX(reg_info->kinds[lldb::eRegisterKindLLDB])) { + uint32_t v[4]; + lldb::offset_t tmp_offset; + offset -= GetGPRSize() + GetFPRSize() + GetVMXSize(); + + if (offset < GetVSXSize() / 2) { + tmp_offset = m_vsx.CopyData(offset / 2, reg_info->byte_size / 2, &v); + + if (tmp_offset != reg_info->byte_size / 2) { + return false; + } + + uint8_t *dst = (uint8_t *)&v + sizeof(uint64_t); + tmp_offset = m_fpr.CopyData(offset / 2, reg_info->byte_size / 2, dst); + + if (tmp_offset != reg_info->byte_size / 2) { + return false; + } + + value.SetBytes(&v, reg_info->byte_size, m_vsx.GetByteOrder()); + return true; + } else { + offset = + m_vmx.CopyData(offset - GetVSXSize() / 2, reg_info->byte_size, &v); + if (offset == reg_info->byte_size) { + value.SetBytes(v, reg_info->byte_size, m_vmx.GetByteOrder()); + return true; + } + } + } else { + uint64_t v = m_gpr.GetMaxU64(&offset, reg_info->byte_size); + + if (offset == reg_info->byte_offset + reg_info->byte_size) { + if (reg_info->byte_size < sizeof(v)) + value = (uint32_t)v; + else 
+ value = v; + return true; + } + } + + return false; +} + +bool RegisterContextCoreAIX_ppc64::WriteRegister( + const RegisterInfo *reg_info, const RegisterValue &value) { + return false; +} diff --git a/lldb/source/Plugins/Process/aix-core/RegisterContextCoreAIX_ppc64.h b/lldb/source/Plugins/Process/aix-core/RegisterContextCoreAIX_ppc64.h new file mode 100644 index 0000000000000..8f1f71ce8d884 --- /dev/null +++ b/lldb/source/Plugins/Process/aix-core/RegisterContextCoreAIX_ppc64.h @@ -0,0 +1,46 @@ +//===-- RegisterContextCoreAIX_ppc64.h ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_REGISTERCONTEXTAIXCORE_PPC64_H +#define LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_REGISTERCONTEXTAIXCORE_PPC64_H + +#include "Plugins/Process/Utility/RegisterContextPOSIX_ppc64le.h" +#include "lldb/Utility/DataExtractor.h" + +class RegisterContextCoreAIX_ppc64 : public RegisterContextPOSIX_ppc64le { +public: + RegisterContextCoreAIX_ppc64( + lldb_private::Thread &thread, + lldb_private::RegisterInfoInterface *register_info, + const lldb_private::DataExtractor &gpregset); + + bool ReadRegister(const lldb_private::RegisterInfo *reg_info, + lldb_private::RegisterValue &value) override; + + bool WriteRegister(const lldb_private::RegisterInfo *reg_info, + const lldb_private::RegisterValue &value) override; + +protected: + size_t GetFPRSize() const; + + size_t GetVMXSize() const; + + size_t GetVSXSize() const; + +private: + lldb::DataBufferSP m_gpr_buffer; + lldb::DataBufferSP m_fpr_buffer; + lldb::DataBufferSP m_vmx_buffer; + lldb::DataBufferSP m_vsx_buffer; + lldb_private::DataExtractor m_gpr; + lldb_private::DataExtractor m_fpr; + lldb_private::DataExtractor m_vmx; + 
lldb_private::DataExtractor m_vsx; +}; + +#endif // LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_REGISTERCONTEXTAIXCORE_PPC64_H diff --git a/lldb/source/Plugins/Process/aix-core/ThreadAIXCore.cpp b/lldb/source/Plugins/Process/aix-core/ThreadAIXCore.cpp new file mode 100644 index 0000000000000..979e5199fe24d --- /dev/null +++ b/lldb/source/Plugins/Process/aix-core/ThreadAIXCore.cpp @@ -0,0 +1,127 @@ +//===-- ThreadAIXCore.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Target/RegisterContext.h" +#include "lldb/Target/StopInfo.h" +#include "lldb/Target/Target.h" +#include "lldb/Target/UnixSignals.h" +#include "lldb/Target/Unwind.h" +#include "lldb/Utility/DataExtractor.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/ProcessInfo.h" + +#include "Plugins/Process/Utility/RegisterContextPOSIX_powerpc.h" +#include "Plugins/Process/Utility/RegisterContextPOSIX_ppc64le.h" +#include "Plugins/Process/Utility/RegisterInfoPOSIX_ppc64le.h" +#include "Plugins/Process/elf-core/RegisterContextPOSIXCore_powerpc.h" +#include "RegisterContextCoreAIX_ppc64.h" + +#include "ProcessAIXCore.h" +#include "AIXCore.h" +#include "ThreadAIXCore.h" + +#include +#include + +using namespace lldb; +using namespace lldb_private; + +// Construct a Thread object with given data +ThreadAIXCore::ThreadAIXCore(Process &process, const ThreadData &td) + : Thread(process, td.tid), m_thread_name(td.name), m_thread_reg_ctx_sp(), + m_gpregset_data(td.gpregset), + m_siginfo(std::move(td.siginfo)) {} + +ThreadAIXCore::~ThreadAIXCore() { DestroyThread(); } + +void ThreadAIXCore::RefreshStateAfterStop() { + GetRegisterContext()->InvalidateIfNeeded(false); +} + 
+RegisterContextSP ThreadAIXCore::GetRegisterContext() { + if (!m_reg_context_sp) { + m_reg_context_sp = CreateRegisterContextForFrame(nullptr); + } + return m_reg_context_sp; +} + +RegisterContextSP +ThreadAIXCore::CreateRegisterContextForFrame(StackFrame *frame) { + RegisterContextSP reg_ctx_sp; + uint32_t concrete_frame_idx = 0; + + if (frame) + concrete_frame_idx = frame->GetConcreteFrameIndex(); + + bool is_linux = false; + if (concrete_frame_idx == 0) { + if (m_thread_reg_ctx_sp) + return m_thread_reg_ctx_sp; + + ProcessAIXCore *process = static_cast(GetProcess().get()); + ArchSpec arch = process->GetArchitecture(); + RegisterInfoInterface *reg_interface = nullptr; + + switch (arch.GetMachine()) { + case llvm::Triple::ppc64: + reg_interface = new RegisterInfoPOSIX_ppc64le(arch); + m_thread_reg_ctx_sp = std::make_shared( + *this, reg_interface, m_gpregset_data); + break; + default: + break; + } + reg_ctx_sp = m_thread_reg_ctx_sp; + } else { + reg_ctx_sp = GetUnwinder().CreateRegisterContextForFrame(frame); + } + return reg_ctx_sp; +} + +bool ThreadAIXCore::CalculateStopInfo() { + ProcessSP process_sp(GetProcess()); + if (!process_sp) + return false; + + lldb::UnixSignalsSP unix_signals_sp(process_sp->GetUnixSignals()); + if (!unix_signals_sp) + return false; + + const char *sig_description; + std::string description = m_siginfo.GetDescription(*unix_signals_sp); + if (description.empty()) + sig_description = nullptr; + else + sig_description = description.c_str(); + + SetStopInfo(StopInfo::CreateStopReasonWithSignal( + *this, m_siginfo.si_signo, sig_description, m_siginfo.si_code)); + + SetStopInfo(m_stop_info_sp); + return true; +} + +void AIXSigInfo::Parse(const AIXCORE::AIXCore64Header data, const ArchSpec &arch, + const lldb_private::UnixSignals &unix_signals) { + si_signo = data.SignalNum; + sigfault.si_addr = data.Fault.context.pc; +} + +AIXSigInfo::AIXSigInfo() { memset(this, 0, sizeof(AIXSigInfo)); } + +size_t AIXSigInfo::GetSize(const 
lldb_private::ArchSpec &arch) { + return sizeof(AIXSigInfo); +} + +std::string AIXSigInfo::GetDescription( + const lldb_private::UnixSignals &unix_signals) const { + return unix_signals.GetSignalDescription(si_signo, 0, + sigfault.si_addr); + +} diff --git a/lldb/source/Plugins/Process/aix-core/ThreadAIXCore.h b/lldb/source/Plugins/Process/aix-core/ThreadAIXCore.h new file mode 100644 index 0000000000000..9ee157e9b2b9b --- /dev/null +++ b/lldb/source/Plugins/Process/aix-core/ThreadAIXCore.h @@ -0,0 +1,110 @@ +//===-- ThreadAIXCore.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_THREADAIXCORE_H +#define LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_THREADAIXCORE_H + +#include "lldb/Target/Thread.h" +#include "lldb/Utility/DataExtractor.h" +#include "llvm/ADT/DenseMap.h" +#include +#include +#include "ProcessAIXCore.h" +#include "AIXCore.h" +#include "ThreadAIXCore.h" + +namespace lldb_private { +class ProcessInstanceInfo; +} + +struct AIXSigInfo { + + //COPY siginfo_t correctly for AIX version + int32_t si_signo; // Order matters for the first 3. 
+ int32_t si_errno; + int32_t si_code; + struct alignas(8) { + lldb::addr_t si_addr; + int16_t si_addr_lsb; + union { + struct { + lldb::addr_t _lower; + lldb::addr_t _upper; + } _addr_bnd; + uint32_t _pkey; + } bounds; + } sigfault; + + enum SigInfoNoteType : uint8_t { eUnspecified, eNT_SIGINFO }; + SigInfoNoteType note_type; + + AIXSigInfo(); + + void Parse(const AIXCORE::AIXCore64Header data, + const lldb_private::ArchSpec &arch, + const lldb_private::UnixSignals &unix_signals); + + std::string + GetDescription(const lldb_private::UnixSignals &unix_signals) const; + + static size_t GetSize(const lldb_private::ArchSpec &arch); +}; + +struct ThreadData { + lldb_private::DataExtractor gpregset; + std::vector notes; + lldb::tid_t tid; + std::string name; + AIXSigInfo siginfo; + int prstatus_sig = 0; +}; + +class ThreadAIXCore : public lldb_private::Thread { +public: + ThreadAIXCore(lldb_private::Process &process, const ThreadData &td); + + ~ThreadAIXCore() override; + + void RefreshStateAfterStop() override; + + lldb::RegisterContextSP GetRegisterContext() override; + + lldb::RegisterContextSP + CreateRegisterContextForFrame(lldb_private::StackFrame *frame) override; + + static bool ThreadIDIsValid(lldb::tid_t thread) { return thread != 0; } + + const char *GetName() override { + if (m_thread_name.empty()) + return nullptr; + return m_thread_name.c_str(); + } + + void SetName(const char *name) override { + if (name && name[0]) + m_thread_name.assign(name); + else + m_thread_name.clear(); + } + + void CreateStopFromSigInfo(const AIXSigInfo &siginfo, + const lldb_private::UnixSignals &unix_signals); + +protected: + // Member variables. 
+ std::string m_thread_name; + lldb::RegisterContextSP m_thread_reg_ctx_sp; + + lldb_private::DataExtractor m_gpregset_data; + std::vector m_notes; + AIXSigInfo m_siginfo; + + bool CalculateStopInfo() override; +}; + +#endif // LLDB_SOURCE_PLUGINS_PROCESS_AIX_CORE_THREADAIXCORE_H From 57cb8058646103eeada1f92e039d9c54ccd4788f Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Mon, 17 Mar 2025 07:31:26 -0500 Subject: [PATCH 40/58] Merge branch 'llvm:main' into llvmgh-101657 --- lldb/cmake/modules/LLDBConfig.cmake | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake index 312791ce36fc7..9df71edd8b359 100644 --- a/lldb/cmake/modules/LLDBConfig.cmake +++ b/lldb/cmake/modules/LLDBConfig.cmake @@ -292,11 +292,7 @@ endif() # Figure out if lldb could use lldb-server. If so, then we'll # ensure we build lldb-server when an lldb target is being built. -<<<<<<< HEAD -if (CMAKE_SYSTEM_NAME MATCHES "Android|Darwin|FreeBSD|Linux|NetBSD|OpenBSD|Windows|AIX") -======= if (CMAKE_SYSTEM_NAME MATCHES "AIX|Android|Darwin|FreeBSD|Linux|NetBSD|OpenBSD|Windows") ->>>>>>> upstream/main set(LLDB_CAN_USE_LLDB_SERVER ON) else() set(LLDB_CAN_USE_LLDB_SERVER OFF) From 0767ef036d3f82d1429e04a319feb6627ea08158 Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Tue, 18 Mar 2025 15:03:17 +0530 Subject: [PATCH 41/58] Error Handling (#32) * Error Handling for aix core module --- .../AIX-DYLD/DynamicLoaderAIXDYLD.cpp | 12 +++++++----- .../ObjectFile/AIXCore/ObjectFileAIXCore.cpp | 15 +++++++-------- lldb/source/Plugins/Process/aix-core/AIXCore.cpp | 2 +- .../Plugins/Process/aix-core/ProcessAIXCore.cpp | 11 ++++++++--- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp index 7e44ffbb1c051..12f24c049f373 100644 --- 
a/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/AIX-DYLD/DynamicLoaderAIXDYLD.cpp @@ -235,17 +235,19 @@ bool DynamicLoaderAIXDYLD::IsCoreFile() const { void DynamicLoaderAIXDYLD::FillCoreLoaderData(lldb_private::DataExtractor &data, uint64_t loader_offset, uint64_t loader_size ) { + Log *log = GetLog(LLDBLog::DynamicLoader); + LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); static char *buffer = (char *)malloc(loader_size); - struct ld_info ldinfo[64]; - char *buffer_complete; + if (buffer == NULL) { + LLDB_LOG(log, "Buffer allocation failed error: {0}", std::strerror(errno)); + return; + } + struct ld_info ldinfo[64] = {}; struct ld_info *ptr; int i = 0; - Log *log = GetLog(LLDBLog::DynamicLoader); - LLDB_LOGF(log, "DynamicLoaderAIXDYLD::%s()", __FUNCTION__); ByteOrder byteorder = data.GetByteOrder(); data.ExtractBytes(loader_offset, loader_size, eByteOrderBig, buffer); - buffer_complete = buffer + loader_size; ldinfo[0].ldinfo_next = 1; while (ldinfo[i++].ldinfo_next != 0) { diff --git a/lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.cpp b/lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.cpp index 5158fa4e25077..0ba1056866937 100644 --- a/lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.cpp +++ b/lldb/source/Plugins/ObjectFile/AIXCore/ObjectFileAIXCore.cpp @@ -73,8 +73,6 @@ ObjectFile *ObjectFileAIXCore::CreateInstance(const lldb::ModuleSP &module_sp, assert(data_sp); - const uint8_t *magic = data_sp->GetBytes() + data_offset; - // Update the data to contain the entire file if it doesn't already if (data_sp->GetByteSize() < length) { data_sp = MapFileDataWritable(*file, length, file_offset); @@ -82,7 +80,6 @@ ObjectFile *ObjectFileAIXCore::CreateInstance(const lldb::ModuleSP &module_sp, return nullptr; data_offset = 0; mapped_writable = true; - magic = data_sp->GetBytes(); } // If we didn't map the data as writable take ownership of the buffer. 
@@ -90,7 +87,6 @@ ObjectFile *ObjectFileAIXCore::CreateInstance(const lldb::ModuleSP &module_sp, data_sp = std::make_shared(data_sp->GetBytes(), data_sp->GetByteSize()); data_offset = 0; - magic = data_sp->GetBytes(); } std::unique_ptr objfile_up(new ObjectFileAIXCore( @@ -124,19 +120,22 @@ size_t ObjectFileAIXCore::GetModuleSpecifications( return specs.GetSize() - initial_count; } -static uint32_t AIXCoreHeaderCheckFromMagic(uint32_t magic) { +static bool AIXCoreHeaderCheckFromMagic(uint32_t magic) { Log *log = GetLog(LLDBLog::Modules); + bool ret = false; switch (magic) { case AIXCORE32: LLDB_LOGF(log, "ObjectFileAIXCore: 32-bit not supported"); break; case AIXCORE64: m_is_core = true; - return 1; + ret = true; + break; + default: break; } - return 0; + return ret; } bool ObjectFileAIXCore::MagicBytesMatch(DataBufferSP &data_sp, @@ -147,7 +146,7 @@ bool ObjectFileAIXCore::MagicBytesMatch(DataBufferSP &data_sp, lldb::offset_t offset = 0; offset += 4; // Skipping to the coredump version uint32_t magic = data.GetU32(&offset); - return AIXCoreHeaderCheckFromMagic(magic) != 0; + return AIXCoreHeaderCheckFromMagic(magic); } bool ObjectFileAIXCore::ParseHeader() { diff --git a/lldb/source/Plugins/Process/aix-core/AIXCore.cpp b/lldb/source/Plugins/Process/aix-core/AIXCore.cpp index 95e47b4d8be53..bc496b5af273f 100644 --- a/lldb/source/Plugins/Process/aix-core/AIXCore.cpp +++ b/lldb/source/Plugins/Process/aix-core/AIXCore.cpp @@ -110,7 +110,7 @@ bool AIXCore64Header::ParseCoreHeader(lldb_private::DataExtractor &data, ret = ParseThreadContext(data, offset); ret = ParseUserData(data, &offset_to_user); - return true; + return ret; } diff --git a/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp b/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp index 9300aa14ac4db..cfcbe1a216116 100644 --- a/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp +++ b/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp @@ -208,8 +208,10 @@ Status 
ProcessAIXCore::DoLoadCore() { exe_module_spec.GetArchitecture() = arch; exe_module_spec.GetFileSpec().SetFile(m_aixcore_header.User.process.pi_comm, FileSpec::Style::native); - exe_module_sp = GetTarget().GetOrCreateModule(exe_module_spec, true); - GetTarget().SetExecutableModule(exe_module_sp, eLoadDependentsNo); + exe_module_sp = + GetTarget().GetOrCreateModule(exe_module_spec, true /* notify */); + if (exe_module_sp) + GetTarget().SetExecutableModule(exe_module_sp, eLoadDependentsNo); } return error; @@ -232,8 +234,11 @@ void ProcessAIXCore::RefreshStateAfterStop() {} // Process Memory size_t ProcessAIXCore::ReadMemory(lldb::addr_t addr, void *buf, size_t size, Status &error) { + if(addr == LLDB_INVALID_ADDRESS) + return 0; + if (lldb::ABISP abi_sp = GetABI()) - addr = abi_sp->FixAnyAddress(addr); + addr = abi_sp->FixAnyAddress(addr); // Don't allow the caching that lldb_private::Process::ReadMemory does since // in core files we have it all cached our our core file anyway. From 8214e5d8cc52b486d3c3f5f4615848b1782cfcf8 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Wed, 26 Mar 2025 01:34:36 -0500 Subject: [PATCH 42/58] Merge branch 'llvm:main' into gh-101657 --- lldb/source/Host/common/Host.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index 575bb46216d19..6cf1112511c3f 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -20,7 +20,6 @@ #include #include #include -#endif #include #endif From d00d28ae68c731efe6f9392000be9822ee74ff0a Mon Sep 17 00:00:00 2001 From: HemangGadhavi Date: Wed, 26 Mar 2025 04:08:23 -0500 Subject: [PATCH 43/58] First time attach resolution --- .../Plugins/Process/AIX/NativeProcessAIX.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp index fc84763857453..83b8ae5eb3258 100644 --- 
a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp +++ b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp @@ -2000,7 +2000,21 @@ Status NativeProcessAIX::PtraceWrapper(int req, lldb::pid_t pid, void *addr, } else if (req == PT_WRITE_BLOCK) { ptrace64(req, pid, (long long)addr, (int)data_size, (int *)result); } else if (req == PT_ATTACH) { + // Block SIGCHLD signal during attach to the process, + // to prevent interruptions. + // The ptrace operation may send SIGCHLD signals in certain cases + // during the attach, which can interfere. + static sigset_t signal_set; + sigemptyset (&signal_set); + sigaddset (&signal_set, SIGCHLD); + if(!pthread_sigmask( SIG_BLOCK, &signal_set, NULL)) + LLDB_LOG(log,"NativeProcessAIX::pthread_sigmask(SIG_BLOCK) Failed"); + ptrace64(req, pid, 0, 0, nullptr); + + //Unblocking the SIGCHLD after attach work. + if(!pthread_sigmask( SIG_UNBLOCK, &signal_set, NULL )) + LLDB_LOG(log,"NativeProcessAIX::pthread_sigmask(SIG_UNBLOCK) Failed"); } else if (req == PT_WATCH) { ptrace64(req, pid, (long long)addr, (int)data_size, nullptr); } else if (req == PT_DETACH) { From 9ff945e62a906e9711022bf8f77b86a0aa05c64e Mon Sep 17 00:00:00 2001 From: HemangGadhavi Date: Tue, 1 Apr 2025 04:54:51 -0500 Subject: [PATCH 44/58] Invalid DWARF rangelist --- lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index cf11e5fb8f5a3..65bd1ee9781d8 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -578,6 +578,7 @@ SectionType ObjectFileXCOFF::GetSectionType(llvm::StringRef sect_name, .Case(".dwinfo", eSectionTypeDWARFDebugInfo) .Case(".dwline", eSectionTypeDWARFDebugLine) .Case(".dwabrev", eSectionTypeDWARFDebugAbbrev) + .Case(".dwrnges", eSectionTypeDWARFDebugRanges) .Default(eSectionTypeInvalid); if 
(section_type != eSectionTypeInvalid) From b443dd5b26354da73cd4d785a093d9d1582b25a8 Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Wed, 2 Apr 2025 11:57:28 +0530 Subject: [PATCH 45/58] Fix for stack memory access from core file (#40) * Fix for stack memory access in core file --- .../Process/aix-core/ProcessAIXCore.cpp | 74 ++++++++++++++++++- .../Plugins/Process/aix-core/ProcessAIXCore.h | 11 +++ 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp b/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp index cfcbe1a216116..bb2db66e2980e 100644 --- a/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp +++ b/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.cpp @@ -94,6 +94,33 @@ ProcessAIXCore::~ProcessAIXCore() { Finalize(true /* destructing */); } +lldb::addr_t ProcessAIXCore::AddAddressRanges(AIXCORE::AIXCore64Header header) { + const lldb::addr_t addr = header.StackBaseAddr; + FileRange file_range(header.StackOffset, header.StackSize); + VMRangeToFileOffset::Entry range_entry(addr, header.StackSize, file_range); + + if (header.StackSize > 0) { + VMRangeToFileOffset::Entry *last_entry = m_core_aranges.Back(); + if (last_entry && + last_entry->GetRangeEnd() == range_entry.GetRangeBase() && + last_entry->data.GetRangeEnd() == range_entry.data.GetRangeBase() && + last_entry->GetByteSize() == last_entry->data.GetByteSize()) { + last_entry->SetRangeEnd(range_entry.GetRangeEnd()); + last_entry->data.SetRangeEnd(range_entry.data.GetRangeEnd()); + } else { + m_core_aranges.Append(range_entry); + } + } + + const uint32_t permissions = lldb::ePermissionsReadable | + lldb::ePermissionsWritable; + + m_core_range_infos.Append( + VMRangeToPermissions::Entry(addr, header.StackSize, permissions)); + + return addr; +} + bool ProcessAIXCore::CanDebug(lldb::TargetSP target_sp, bool plugin_specified_by_name) { @@ -170,6 +197,7 @@ Status ProcessAIXCore::DoLoadCore() { } FileSpec file = 
m_core_module_sp->GetObjectFile()->GetFileSpec(); + Log *log = GetLog(LLDBLog::Process); if (file) { const size_t header_size = sizeof(AIXCORE::AIXCore64Header); @@ -180,6 +208,9 @@ Status ProcessAIXCore::DoLoadCore() { DataExtractor data(data_sp, lldb::eByteOrderBig, 4); lldb::offset_t data_offset = 0; m_aixcore_header.ParseCoreHeader(data, &data_offset); + lldb::addr_t addr = AddAddressRanges(m_aixcore_header); + if (addr == LLDB_INVALID_ADDRESS) + LLDB_LOGF(log, "ProcessAIXCore: Invalid base address. Stack information will be limited"); auto dyld = static_cast(GetDynamicLoader()); dyld->FillCoreLoaderData(data, m_aixcore_header.LoaderOffset, m_aixcore_header.LoaderSize); @@ -246,7 +277,48 @@ size_t ProcessAIXCore::ReadMemory(lldb::addr_t addr, void *buf, size_t size, } size_t ProcessAIXCore::DoReadMemory(lldb::addr_t addr, void *buf, size_t size, - Status &error) { return 0; } + Status &error) { + ObjectFile *core_objfile = m_core_module_sp->GetObjectFile(); + if (core_objfile == nullptr) + return 0; + // Get the address range + const VMRangeToFileOffset::Entry *address_range = + m_core_aranges.FindEntryThatContains(addr); + if (address_range == nullptr || address_range->GetRangeEnd() < addr) { + error = Status::FromErrorStringWithFormat( + "core file does not contain 0x%" PRIx64, addr); + return 0; + } + + // Convert the address into core file offset + const lldb::addr_t offset = addr - address_range->GetRangeBase(); + const lldb::addr_t file_start = address_range->data.GetRangeBase(); + const lldb::addr_t file_end = address_range->data.GetRangeEnd(); + size_t bytes_to_read = size; // Number of bytes to read from the core file + size_t bytes_copied = 0; // Number of bytes actually read from the core file + // Number of bytes available in the core file from the given address + lldb::addr_t bytes_left = 0; + + // Don't proceed if core file doesn't contain the actual data for this + // address range. 
+ if (file_start == file_end) + return 0; + + // Figure out how many on-disk bytes remain in this segment starting at the + // given offset + if (file_end > file_start + offset) + bytes_left = file_end - (file_start + offset); + + if (bytes_to_read > bytes_left) + bytes_to_read = bytes_left; + + // If there is data available on the core file read it + if (bytes_to_read) + bytes_copied = + core_objfile->CopyData(offset + file_start, bytes_to_read, buf); + + return bytes_copied; +} Status ProcessAIXCore::DoGetMemoryRegionInfo(lldb::addr_t load_addr, MemoryRegionInfo ®ion_info) { diff --git a/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.h b/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.h index 9880c491689ca..ffd9e401ee192 100644 --- a/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.h +++ b/lldb/source/Plugins/Process/aix-core/ProcessAIXCore.h @@ -85,11 +85,22 @@ class ProcessAIXCore : public lldb_private::PostMortemProcess { void ParseAIXCoreFile(); + lldb::addr_t AddAddressRanges(AIXCORE::AIXCore64Header header); private: lldb::ModuleSP m_core_module_sp; std::string m_dyld_plugin_name; + typedef lldb_private::Range FileRange; + typedef lldb_private::RangeDataVector + VMRangeToFileOffset; + typedef lldb_private::RangeDataVector + VMRangeToPermissions; + + // Address ranges found in the core + VMRangeToFileOffset m_core_aranges; + VMRangeToPermissions m_core_range_infos; + // True if m_thread_contexts contains valid entries bool m_thread_data_valid = false; AIXCORE::AIXCore64Header m_aixcore_header; From 96db5e3257436a222ee38049528d682166421fa1 Mon Sep 17 00:00:00 2001 From: Dhruv-Srivastava Date: Wed, 2 Apr 2025 01:46:46 -0500 Subject: [PATCH 46/58] Build fail: SBMutex --- lldb/source/API/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index 0045edf6743d1..2c8f2d583c054 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -83,6 +83,7 @@ 
add_lldb_library(liblldb STATIC ${option_framework} SBMemoryRegionInfoList.cpp SBModule.cpp SBModuleSpec.cpp + SBMutex.cpp SBPlatform.cpp SBProcess.cpp SBProgress.cpp From d7a892ef207cde5430021b8705279cc08dc4e0f7 Mon Sep 17 00:00:00 2001 From: HemangGadhavi Date: Wed, 30 Apr 2025 04:53:28 -0500 Subject: [PATCH 47/58] Added change for step command issue --- .../Plugins/Process/AIX/NativeProcessAIX.cpp | 87 +++++++++++++------ 1 file changed, 61 insertions(+), 26 deletions(-) diff --git a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp index 83b8ae5eb3258..ace9e11927bee 100644 --- a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp +++ b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp @@ -19,7 +19,7 @@ #include #include #include - +#include #include "NativeThreadAIX.h" #include "Plugins/Process/POSIX/ProcessPOSIXLog.h" //#include "Plugins/Process/Utility/LinuxProcMaps.h" @@ -79,6 +79,7 @@ using namespace lldb_private; using namespace lldb_private::process_aix; using namespace llvm; +typedef std::function)> AIXMapCallback; // Private bits we only need internally. static bool ProcessVmReadvSupported() { @@ -988,13 +989,6 @@ Status NativeProcessAIX::GetMemoryRegionInfo(lldb::addr_t load_addr, for (auto it = m_mem_region_cache.begin(); it != m_mem_region_cache.end(); ++it) { MemoryRegionInfo &proc_entry_info = it->first; - - // Sanity check assumption that /proc/{pid}/maps entries are ascending. - assert((proc_entry_info.GetRange().GetRangeBase() >= prev_base_address) && - "descending /proc/pid/maps entries detected, unexpected"); - prev_base_address = proc_entry_info.GetRange().GetRangeBase(); - UNUSED_IF_ASSERT_DISABLED(prev_base_address); - // If the target address comes before this entry, indicate distance to next // region. 
if (load_addr < proc_entry_info.GetRange().GetRangeBase()) { @@ -1029,9 +1023,59 @@ Status NativeProcessAIX::GetMemoryRegionInfo(lldb::addr_t load_addr, return error; } +// Parsing the AIX map file /proc/PID/map +// The map file contains an array of prmap structures +// which has all the information like size, startaddress, object name, permissions +bool ParseAIXMapRegions(const char *aix_map, AIXMapCallback const &callback) { + MemoryRegionInfo region; + struct prmap *prmapData = (struct prmap *)aix_map; + struct prmap *entry; + uint32_t perm_flag; + + for(entry = prmapData;!(entry->pr_size == 0 && entry->pr_vaddr == NULL); entry++) { + const char *o_name = aix_map + entry->pr_pathoff; + lldb::addr_t start_address = (lldb::addr_t )entry->pr_vaddr; + lldb::addr_t end_address = start_address + entry->pr_size; + region.GetRange().SetRangeBase(start_address); + region.GetRange().SetRangeEnd(end_address); + region.SetMapped(MemoryRegionInfo::OptionalBool::eYes); + perm_flag = entry->pr_mflags; + + if(perm_flag & MA_READ) + region.SetReadable(MemoryRegionInfo::OptionalBool::eYes); + else + region.SetReadable(MemoryRegionInfo::OptionalBool::eNo); + + if(perm_flag & MA_WRITE) + region.SetWritable(MemoryRegionInfo::OptionalBool::eYes); + else + region.SetWritable(MemoryRegionInfo::OptionalBool::eNo); + + if(perm_flag & MA_EXEC) + region.SetExecutable(MemoryRegionInfo::OptionalBool::eYes); + else + region.SetExecutable(MemoryRegionInfo::OptionalBool::eNo); + + if((perm_flag & MA_SLIBTEXT) || (perm_flag & MA_SLIBDATA)) + region.SetShared(MemoryRegionInfo::OptionalBool::eYes); + else if ((perm_flag & MA_PLIBTEXT) || (perm_flag & MA_PLIBDATA)) + region.SetShared(MemoryRegionInfo::OptionalBool::eNo); + else + region.SetShared(MemoryRegionInfo::OptionalBool::eDontKnow); + + if(o_name) + region.SetName(o_name); + + callback(region); + region.Clear(); + } + + return true; +} + + Status NativeProcessAIX::PopulateMemoryRegionCache() { Log *log = GetLog(POSIXLog::Process); - // If 
our cache is empty, pull the latest. There should always be at least // one memory region if memory region handling is supported. if (!m_mem_region_cache.empty()) { @@ -1041,7 +1085,7 @@ Status NativeProcessAIX::PopulateMemoryRegionCache() { } Status Result; -#if 0 + AIXMapCallback callback = [&](llvm::Expected Info) { if (Info) { FileSpec file_spec(Info->GetName().GetCString()); @@ -1050,26 +1094,17 @@ Status NativeProcessAIX::PopulateMemoryRegionCache() { return true; } - Result = Info.takeError(); + Result = Status::FromError(Info.takeError()); m_supports_mem_region = LazyBool::eLazyBoolNo; LLDB_LOG(log, "failed to parse proc maps: {0}", Result); return false; }; - // AIX kernel since 2.6.14 has /proc/{pid}/smaps - // if CONFIG_PROC_PAGE_MONITOR is enabled - auto BufferOrError = getProcFile(GetID(), GetCurrentThreadID(), "smaps"); - if (BufferOrError) - ParseAIXSMapRegions(BufferOrError.get()->getBuffer(), callback); - else { - BufferOrError = getProcFile(GetID(), GetCurrentThreadID(), "maps"); - if (!BufferOrError) { - m_supports_mem_region = LazyBool::eLazyBoolNo; - return BufferOrError.getError(); - } - - ParseAIXMapRegions(BufferOrError.get()->getBuffer(), callback); - } + auto BufferOrError = getProcFile(GetID(), "map"); + if (BufferOrError) { + std::unique_ptr MapBuffer = std::move(*BufferOrError); + ParseAIXMapRegions(MapBuffer->getBufferStart(), callback); + } if (Result.Fail()) return Result; @@ -1090,7 +1125,7 @@ Status NativeProcessAIX::PopulateMemoryRegionCache() { // We support memory retrieval, remember that. 
m_supports_mem_region = LazyBool::eLazyBoolYes; -#endif + return Status(); } From 09e392a16dba25a5a8ddccf7893289a0a8798a3b Mon Sep 17 00:00:00 2001 From: HemangGadhavi Date: Wed, 14 May 2025 00:52:06 -0500 Subject: [PATCH 48/58] Removing netbsd license dependency --- lldb/source/Host/common/Host.cpp | 165 ++----------------------------- 1 file changed, 7 insertions(+), 158 deletions(-) diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index 6cf1112511c3f..7444163366fe5 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -349,168 +349,16 @@ bool Host::ResolveExecutableInBundle(FileSpec &file) { return false; } #include extern char **p_xargv; -/* Fix missing Dl_info & dladdr in AIX - * The code is taken from netbsd.org (src/crypto/external/bsd/openssl/dist/crypto/dso/dso_dlfcn.c) - * except strlcpy & strlcat (those are taken from openbsd.org (src/lib/libc/string)) - */ -/*- - * See IBM's AIX Version 7.2, Technical Reference: - * Base Operating System and Extensions, Volume 1 and 2 - * https://www.ibm.com/support/knowledgecenter/ssw_aix_72/com.ibm.aix.base/technicalreferences.htm - */ -#include -#include - -/* strlcpy: - * Copy string src to buffer dst of size dsize. At most dsize-1 - * chars will be copied. Always NUL terminates (unless dsize == 0). - * Returns strlen(src); if retval >= dsize, truncation occurred. - */ -size_t strlcpy(char *dst, const char *src, size_t dsize) -{ - const char *osrc = src; - size_t nleft = dsize; - - /* Copy as many bytes as will fit. */ - if (nleft != 0) { - while (--nleft != 0) { - if ((*dst++ = *src++) == '\0') { - break; - } - } - } - - /* Not enough room in dst, add NUL and traverse rest of src. 
*/ - if (nleft == 0) { - if (dsize != 0) { - *dst = '\0'; /* NUL-terminate dst */ - } - while (*src++) { - ; - } - } - - return src - osrc - 1; /* count does not include NUL */ -} - -/* strlcat: - * Appends src to string dst of size dsize (unlike strncat, dsize is the - * full size of dst, not space left). At most dsize-1 characters - * will be copied. Always NUL terminates (unless dsize <= strlen(dst)). - * Returns strlen(src) + MIN(dsize, strlen(initial dst)). - * If retval >= dsize, truncation occurred. - */ -size_t strlcat(char *dst, const char *src, size_t dsize) -{ - const char *odst = dst; - const char *osrc = src; - size_t n = dsize; - size_t dlen; - - /* Find the end of dst and adjust bytes left but don't go past end. */ - while (n-- != 0 && *dst != '\0') { - dst++; - } - dlen = dst - odst; - n = dsize - dlen; - - if (n-- == 0) { - return dlen + strlen(src); - } - while (*src != '\0') { - if (n != 0) { - *dst++ = *src; - n--; - } - src++; - } - *dst = '\0'; - - return dlen + src - osrc; /* count does not include NUL */ -} - -/* ~ 64 * (sizeof(struct ld_info) + _XOPEN_PATH_MAX + _XOPEN_NAME_MAX) */ -# define DLFCN_LDINFO_SIZE 86976 -typedef struct Dl_info { - const char *dli_fname; -} Dl_info; -/* - * This dladdr()-implementation will also find the ptrgl (Pointer Glue) virtual - * address of a function, which is just located in the DATA segment instead of - * the TEXT segment. 
- */ -static int dladdr(const void *ptr, Dl_info *dl) -{ - uintptr_t addr = (uintptr_t)ptr; - struct ld_info *ldinfos; - struct ld_info *next_ldi; - struct ld_info *this_ldi; - - if ((ldinfos = (struct ld_info *)malloc(DLFCN_LDINFO_SIZE)) == NULL) { - dl->dli_fname = NULL; - return 0; - } - - if ((loadquery(L_GETINFO, (void *)ldinfos, DLFCN_LDINFO_SIZE)) < 0) { - /*- - * Error handling is done through errno and dlerror() reading errno: - * ENOMEM (ldinfos buffer is too small), - * EINVAL (invalid flags), - * EFAULT (invalid ldinfos ptr) - */ - free((void *)ldinfos); - dl->dli_fname = NULL; - return 0; - } - next_ldi = ldinfos; - - do { - this_ldi = next_ldi; - if (((addr >= (uintptr_t)this_ldi->ldinfo_textorg) - && (addr < ((uintptr_t)this_ldi->ldinfo_textorg + - this_ldi->ldinfo_textsize))) - || ((addr >= (uintptr_t)this_ldi->ldinfo_dataorg) - && (addr < ((uintptr_t)this_ldi->ldinfo_dataorg + - this_ldi->ldinfo_datasize)))) { - char *buffer = NULL; - char *member = NULL; - size_t buffer_sz; - size_t member_len; - - buffer_sz = strlen(this_ldi->ldinfo_filename) + 1; - member = this_ldi->ldinfo_filename + buffer_sz; - if ((member_len = strlen(member)) > 0) { - buffer_sz += 1 + member_len + 1; - } - if ((buffer = (char *)malloc(buffer_sz)) != NULL) { - strlcpy(buffer, this_ldi->ldinfo_filename, buffer_sz); - if (member_len > 0) { - /* - * Need to respect a possible member name and not just - * returning the path name in this case. See docs: - * sys/ldr.h, loadquery() and dlopen()/RTLD_MEMBER. 
- */ - strlcat(buffer, "(", buffer_sz); - strlcat(buffer, member, buffer_sz); - strlcat(buffer, ")", buffer_sz); - } - dl->dli_fname = buffer; - } - break; - } else { - next_ldi = (struct ld_info *)((uintptr_t)this_ldi + - this_ldi->ldinfo_next); - } - } while (this_ldi->ldinfo_next); - free((void *)ldinfos); - return dl->dli_fname != NULL; -} - #endif FileSpec Host::GetModuleFileSpecForHostAddress(const void *host_addr) { FileSpec module_filespec; #ifdef _AIX + // TODO: As the current AIX LLDB is static, we don't need dladdr which is + // only for shared libraries. Below is the hack to find the module name + // for static LLDB + // FIXME: If LLDB is later built as a shared library, we have to find a way similar to dladdr + // since AIX does not support the dladdr API. if (host_addr == reinterpret_cast(HostInfoBase::ComputeSharedLibraryDirectory)) { // FIXME: AIX dladdr return "lldb" for this case if (p_xargv[0]) { @@ -519,7 +367,7 @@ FileSpec Host::GetModuleFileSpecForHostAddress(const void *host_addr) { return module_filespec; } } -#endif +#else #if !defined(__ANDROID__) Dl_info info; if (::dladdr(host_addr, &info)) { @@ -528,6 +376,7 @@ FileSpec Host::GetModuleFileSpecForHostAddress(const void *host_addr) { FileSystem::Instance().Resolve(module_filespec); } } +#endif #endif return module_filespec; } From b158b8788fa6bf0e13d02373ce9de0d62a4a6aba Mon Sep 17 00:00:00 2001 From: HemangGadhavi Date: Fri, 16 May 2025 00:51:25 -0500 Subject: [PATCH 49/58] Removed netbsd license files --- lldb/NOTICE.TXT | 7 - .../source/Host/common/LICENSE.aix-netbsd.txt | 125 ------------------ 2 files changed, 132 deletions(-) delete mode 100644 lldb/NOTICE.TXT delete mode 100644 lldb/source/Host/common/LICENSE.aix-netbsd.txt diff --git a/lldb/NOTICE.TXT b/lldb/NOTICE.TXT deleted file mode 100644 index d814272967476..0000000000000 --- a/lldb/NOTICE.TXT +++ /dev/null @@ -1,7 +0,0 @@ -
- - * LICENSE: - * lldb/source/Host/common/LICENSE.aix-netbsd.txt (OpenSSL License) - * HOMEPAGE: - * https://ftp.netbsd.org/pub/NetBSD/NetBSD-current/src/crypto/external/bsd/openssl/dist diff --git a/lldb/source/Host/common/LICENSE.aix-netbsd.txt b/lldb/source/Host/common/LICENSE.aix-netbsd.txt deleted file mode 100644 index 9601ab43575f9..0000000000000 --- a/lldb/source/Host/common/LICENSE.aix-netbsd.txt +++ /dev/null @@ -1,125 +0,0 @@ - - LICENSE ISSUES - ============== - - The OpenSSL toolkit stays under a double license, i.e. both the conditions of - the OpenSSL License and the original SSLeay license apply to the toolkit. - See below for the actual license texts. - - OpenSSL License - --------------- - -/* ==================================================================== - * Copyright (c) 1998-2019 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. 
Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ - - Original SSLeay License - ----------------------- - -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - From c6d2bc2fa330214c9ae5ab1841d8b724b5dec2f4 Mon Sep 17 00:00:00 2001 From: HemangGadhavi Date: Fri, 16 May 2025 04:57:09 -0500 Subject: [PATCH 50/58] Get base address for each module for image list command --- lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index cf11e5fb8f5a3..2e19577f2bf8b 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -717,7 +717,8 @@ lldb_private::Address ObjectFileXCOFF::GetEntryPointAddress() { } lldb_private::Address ObjectFileXCOFF::GetBaseAddress() { - return lldb_private::Address(); + // Get base address of the section + return Address(GetSectionList()->GetSectionAtIndex(0), 0); } ObjectFile::Type ObjectFileXCOFF::CalculateType() { From 8dd3bd995bf08e50d89a21f7c3bf227ed856798a Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Fri, 16 May 2025 18:05:57 +0530 Subject: [PATCH 51/58] Fix for GetName() (#61) Need to access process/main-thread name from psinfo on AIX rather than comm on Linux. (0) root @ unobosdev002: /home/dhruv/dio_fs/LLDB/build-lldb # bin/lldb ../tests/1 (lldb) target create "../tests/1" Current executable set to '/home/dhruv/dio_fs/LLDB/tests/1' (powerpc64).
(lldb) b main Breakpoint 1: where = 1`main + 8 at 1.c:3, address = 0x0000000100000928 (lldb) r Process 44433678 launched: '/home/dhruv/dio_fs/LLDB/tests/1' (powerpc64) Process 44433678 stopped * thread #1, name = '1', stop reason = breakpoint 1.1 frame #0: 0x0000000100000928 1`main at 1.c:3 1 int main() 2 { -> 3 int x = 10; 4 x++; 5 return x; 6 } (lldb) thread info thread #1: tid = 0x2a6010e, 0x0000000100000928 1`main at 1.c:3, name = '1', stop reason = breakpoint 1.1 --- lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp index bb14b6ab4a05e..337caa7c750e8 100644 --- a/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp +++ b/lldb/source/Plugins/Process/AIX/NativeThreadAIX.cpp @@ -24,7 +24,8 @@ #include "llvm/ADT/SmallString.h" #include "Plugins/Process/POSIX/CrashReason.h" - +#include +#include #include #include #include @@ -102,11 +103,15 @@ NativeThreadAIX::NativeThreadAIX(NativeProcessAIX &process, std::string NativeThreadAIX::GetName() { NativeProcessAIX &process = GetProcess(); - - auto BufferOrError = getProcFile(process.GetID(), GetID(), "comm"); + auto BufferOrError = getProcFile(process.GetID(), "psinfo"); if (!BufferOrError) return ""; - return std::string(BufferOrError.get()->getBuffer().rtrim('\n')); + auto &Buffer = *BufferOrError; + if (Buffer->getBufferSize() < sizeof(psinfo_t)) + return ""; + const psinfo_t *psinfo = + reinterpret_cast(Buffer->getBufferStart()); + return std::string(psinfo->pr_fname); } lldb::StateType NativeThreadAIX::GetState() { return m_state; } From 2e0029cd3afb8bba9ed8c16048214a11db353b1f Mon Sep 17 00:00:00 2001 From: Ravindra Shinde Date: Fri, 16 May 2025 03:28:24 -0500 Subject: [PATCH 52/58] Global variables are not accessed correctly 1188732:lldb, print command does not works properly sometimes Signed-off-by: Ravindra Shinde --- 
.../ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index cf11e5fb8f5a3..e544417ee4c61 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -325,6 +325,7 @@ bool ObjectFileXCOFF::SetLoadAddress(Target &target, lldb::addr_t value, if (module_sp) { size_t num_loaded_sections = 0; SectionList *section_list = GetSectionList(); + if (section_list) { const size_t num_sections = section_list->GetSize(); size_t sect_idx = 0; @@ -333,16 +334,19 @@ bool ObjectFileXCOFF::SetLoadAddress(Target &target, lldb::addr_t value, // Iterate through the object file sections to find all of the sections // that have SHF_ALLOC in their flag bits. SectionSP section_sp(section_list->GetSectionAtIndex(sect_idx)); - if (section_sp && !section_sp->IsThreadSpecific()) { - bool use_offset = false; - if (strcmp(section_sp->GetName().AsCString(), ".text") == 0 || - strcmp(section_sp->GetName().AsCString(), ".data") == 0 || - strcmp(section_sp->GetName().AsCString(), ".bss") == 0) - use_offset = true; + if (section_sp && !section_sp->IsThreadSpecific()) { + addr_t load_addr = 0; + if (!value_is_offset) + load_addr = section_sp->GetFileAddress(); + else { + if (strcmp(section_sp->GetName().AsCString(), ".text") == 0) + load_addr = section_sp->GetFileOffset() + value; + else /* Other sections: data, bss, loader, dwline, dwinfo, dwabrev */ + load_addr = section_sp->GetFileAddress() + value; + } if (target.GetSectionLoadListPublic().SetSectionLoadAddress( - section_sp, (use_offset ? 
- (section_sp->GetFileOffset() + value) : (section_sp->GetFileAddress() + value)))) + section_sp, load_addr)) ++num_loaded_sections; } } From e0952b5841148298a5ae7242bb0a7f951b2f9d14 Mon Sep 17 00:00:00 2001 From: DhruvSrivastavaX Date: Mon, 9 Jun 2025 05:17:27 -0500 Subject: [PATCH 53/58] Stabilizing the branch to fix forced-update issue - Linked to Recent PRs for aix/Host.cpp, aix/Support.cpp etc --- lldb/include/lldb/Host/aix/Support.h | 9 - lldb/source/Host/aix/Host.cpp | 257 ++++++++------------------- 2 files changed, 74 insertions(+), 192 deletions(-) diff --git a/lldb/include/lldb/Host/aix/Support.h b/lldb/include/lldb/Host/aix/Support.h index 7be0c9ac98c67..f02a1904b09fa 100644 --- a/lldb/include/lldb/Host/aix/Support.h +++ b/lldb/include/lldb/Host/aix/Support.h @@ -18,15 +18,6 @@ namespace lldb_private { llvm::ErrorOr> getProcFile(::pid_t pid, ::pid_t tid, const llvm::Twine &file); -<<<<<<< HEAD -llvm::ErrorOr> -getProcFile(::pid_t pid, const llvm::Twine &file); - -llvm::ErrorOr> -getProcFile(const llvm::Twine &file); - -======= ->>>>>>> upstream/main } // namespace lldb_private #endif // #ifndef LLDB_HOST_AIX_SUPPORT_H diff --git a/lldb/source/Host/aix/Host.cpp b/lldb/source/Host/aix/Host.cpp index 9680af52598af..b5572b93d93a9 100644 --- a/lldb/source/Host/aix/Host.cpp +++ b/lldb/source/Host/aix/Host.cpp @@ -6,37 +6,17 @@ // //===----------------------------------------------------------------------===// -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "llvm/ADT/StringSwitch.h" -#include "llvm/Object/ELF.h" -#include "llvm/Support/ScopedPrinter.h" - +#include "lldb/Host/Host.h" +#include "lldb/Host/posix/Support.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/ProcessInfo.h" #include "lldb/Utility/Status.h" - -#include "lldb/Host/FileSystem.h" -#include "lldb/Host/Host.h" -#include "lldb/Host/HostInfo.h" -#include "lldb/Host/aix/Host.h" -#include 
"lldb/Host/posix/Support.h" -#include "lldb/Utility/DataExtractor.h" #include "llvm/BinaryFormat/XCOFF.h" #include #include #include -using namespace llvm; using namespace lldb; using namespace lldb_private; @@ -55,173 +35,98 @@ enum class ProcessState { }; } -namespace lldb_private { -class ProcessLaunchInfo; +static ProcessInstanceInfo::timespec convert(pr_timestruc64_t t) { + ProcessInstanceInfo::timespec ts; + ts.tv_sec = t.tv_sec; + ts.tv_usec = t.tv_nsec / 1000; // nanos to micros + return ts; } -static bool GetStatusInfo(::pid_t Pid, ProcessInstanceInfo &ProcessInfo, - ProcessState &State, ::pid_t &TracerPid, - ::pid_t &Tgid) { - Log *log = GetLog(LLDBLog::Host); - - auto BufferOrError = getProcFile(Pid, "status"); +static bool GetStatusInfo(::pid_t pid, ProcessInstanceInfo &processInfo, + ProcessState &State) { + struct pstatus pstatusData; + auto BufferOrError = getProcFile(pid, "status"); if (!BufferOrError) return false; - llvm::StringRef Rest = BufferOrError.get()->getBuffer(); - while (!Rest.empty()) { - llvm::StringRef Line; - std::tie(Line, Rest) = Rest.split('\n'); - - if (Line.consume_front("Gid:")) { - // Real, effective, saved set, and file system GIDs. Read the first two. - Line = Line.ltrim(); - uint32_t RGid, EGid; - Line.consumeInteger(10, RGid); - Line = Line.ltrim(); - Line.consumeInteger(10, EGid); - - ProcessInfo.SetGroupID(RGid); - ProcessInfo.SetEffectiveGroupID(EGid); - } else if (Line.consume_front("Uid:")) { - // Real, effective, saved set, and file system UIDs. Read the first two. 
- Line = Line.ltrim(); - uint32_t RUid, EUid; - Line.consumeInteger(10, RUid); - Line = Line.ltrim(); - Line.consumeInteger(10, EUid); - - ProcessInfo.SetUserID(RUid); - ProcessInfo.SetEffectiveUserID(EUid); - } else if (Line.consume_front("PPid:")) { - ::pid_t PPid; - Line.ltrim().consumeInteger(10, PPid); - ProcessInfo.SetParentProcessID(PPid); - } else if (Line.consume_front("State:")) { - State = llvm::StringSwitch(Line.ltrim().take_front(1)) - .Case("D", ProcessState::DiskSleep) - .Case("I", ProcessState::Idle) - .Case("R", ProcessState::Running) - .Case("S", ProcessState::Sleeping) - .CaseLower("T", ProcessState::TracedOrStopped) - .Case("W", ProcessState::Paging) - .Case("P", ProcessState::Parked) - .Case("X", ProcessState::Dead) - .Case("Z", ProcessState::Zombie) - .Default(ProcessState::Unknown); - if (State == ProcessState::Unknown) { - LLDB_LOG(log, "Unknown process state {0}", Line); - } - } else if (Line.consume_front("TracerPid:")) { - Line = Line.ltrim(); - Line.consumeInteger(10, TracerPid); - } else if (Line.consume_front("Tgid:")) { - Line = Line.ltrim(); - Line.consumeInteger(10, Tgid); - } - } - return true; -} + std::unique_ptr StatusBuffer = std::move(*BufferOrError); + // Ensure there's enough data for psinfoData + if (StatusBuffer->getBufferSize() < sizeof(pstatusData)) + return false; -static bool IsDirNumeric(const char *dname) { - for (; *dname; dname++) { - if (!isdigit(*dname)) - return false; + std::memcpy(&pstatusData, StatusBuffer->getBufferStart(), + sizeof(pstatusData)); + switch (pstatusData.pr_stat) { + case SIDL: + State = ProcessState::Idle; + break; + case SACTIVE: + State = ProcessState::Running; + break; + case SSTOP: + State = ProcessState::TracedOrStopped; + break; + case SZOMB: + State = ProcessState::Zombie; + break; + default: + State = ProcessState::Unknown; + break; } + processInfo.SetIsZombie(State == ProcessState::Zombie); + processInfo.SetUserTime(convert(pstatusData.pr_utime)); + 
processInfo.SetSystemTime(convert(pstatusData.pr_stime)); + processInfo.SetCumulativeUserTime(convert(pstatusData.pr_cutime)); + processInfo.SetCumulativeSystemTime(convert(pstatusData.pr_cstime)); return true; } -static void GetProcessArgs(::pid_t pid, ProcessInstanceInfo &process_info) { - auto BufferOrError = getProcFile(pid, "cmdline"); - if (!BufferOrError) - return; - std::unique_ptr Cmdline = std::move(*BufferOrError); - - llvm::StringRef Arg0, Rest; - std::tie(Arg0, Rest) = Cmdline->getBuffer().split('\0'); - process_info.SetArg0(Arg0); - while (!Rest.empty()) { - llvm::StringRef Arg; - std::tie(Arg, Rest) = Rest.split('\0'); - process_info.GetArguments().AppendArgument(Arg); - } -} - -static void GetExePathAndArch(::pid_t pid, ProcessInstanceInfo &process_info) { - Log *log = GetLog(LLDBLog::Process); - std::string ExePath(PATH_MAX, '\0'); - std::string Basename(PATH_MAX, '\0'); +static bool GetExePathAndIds(::pid_t pid, ProcessInstanceInfo &process_info) { struct psinfo psinfoData; + auto BufferOrError = getProcFile(pid, "psinfo"); + if (!BufferOrError) + return false; - // We can't use getProcFile here because proc/[pid]/exe is a symbolic link. 
- llvm::SmallString<64> ProcExe; - (llvm::Twine("/proc/") + llvm::Twine(pid) + "/cwd").toVector(ProcExe); - - ssize_t len = readlink(ProcExe.c_str(), &ExePath[0], PATH_MAX); - if (len > 0) { - ExePath.resize(len); - - //FIXME: hack to get basename - struct stat statData; - - std::ostringstream oss; - - oss << "/proc/" << std::dec << pid << "/psinfo"; - assert(stat(oss.str().c_str(), &statData) == 0); - - const int fd = open(oss.str().c_str(), O_RDONLY); - assert (fd >= 0); - - ssize_t readNum = read(fd, &psinfoData, sizeof(psinfoData)); - assert (readNum >= 0); - - close (fd); - } else { - LLDB_LOG(log, "failed to read link exe link for {0}: {1}", pid, - Status(errno, eErrorTypePOSIX)); - ExePath.resize(0); - } - - llvm::StringRef PathRef = std::string(&(psinfoData.pr_psargs[0])); - - if (!PathRef.empty()) { - process_info.GetExecutableFile().SetFile(PathRef, FileSpec::Style::native); - ArchSpec arch_spec = ArchSpec(); - arch_spec.SetArchitecture(eArchTypeXCOFF, XCOFF::TCPU_PPC64, LLDB_INVALID_CPUTYPE, llvm::Triple::AIX); - process_info.SetArchitecture(arch_spec); - } -} + std::unique_ptr PsinfoBuffer = std::move(*BufferOrError); + // Ensure there's enough data for psinfoData + if (PsinfoBuffer->getBufferSize() < sizeof(psinfoData)) + return false; -static void GetProcessEnviron(::pid_t pid, ProcessInstanceInfo &process_info) { - // Get the process environment. 
- auto BufferOrError = getProcFile(pid, "environ"); - if (!BufferOrError) - return; + std::memcpy(&psinfoData, PsinfoBuffer->getBufferStart(), sizeof(psinfoData)); + llvm::StringRef PathRef( + psinfoData.pr_psargs, + strnlen(psinfoData.pr_psargs, sizeof(psinfoData.pr_psargs))); + if (PathRef.empty()) + return false; - std::unique_ptr Environ = std::move(*BufferOrError); - llvm::StringRef Rest = Environ->getBuffer(); - while (!Rest.empty()) { - llvm::StringRef Var; - std::tie(Var, Rest) = Rest.split('\0'); - process_info.GetEnvironment().insert(Var); - } + process_info.GetExecutableFile().SetFile(PathRef, FileSpec::Style::native); + ArchSpec arch_spec = ArchSpec(); + arch_spec.SetArchitecture(eArchTypeXCOFF, llvm::XCOFF::TCPU_PPC64, + LLDB_INVALID_CPUTYPE, llvm::Triple::AIX); + process_info.SetArchitecture(arch_spec); + process_info.SetParentProcessID(psinfoData.pr_ppid); + process_info.SetGroupID(psinfoData.pr_gid); + process_info.SetEffectiveGroupID(psinfoData.pr_egid); + process_info.SetUserID(psinfoData.pr_uid); + process_info.SetEffectiveUserID(psinfoData.pr_euid); + process_info.SetProcessGroupID(psinfoData.pr_pgid); + process_info.SetProcessSessionID(psinfoData.pr_sid); + return true; } static bool GetProcessAndStatInfo(::pid_t pid, ProcessInstanceInfo &process_info, - ProcessState &State, ::pid_t &tracerpid) { - ::pid_t tgid; - tracerpid = 0; + ProcessState &State) { process_info.Clear(); - process_info.SetProcessID(pid); - GetExePathAndArch(pid, process_info); - GetProcessArgs(pid, process_info); - GetProcessEnviron(pid, process_info); - - // Get User and Group IDs and get tracer pid. - if (!GetStatusInfo(pid, process_info, State, tracerpid, tgid)) + if (pid == LLDB_INVALID_PROCESS_ID) + return false; + // Get Executable path/Arch and Get User and Group IDs. + if (!GetExePathAndIds(pid, process_info)) + return false; + // Get process status and timing info. 
+ if (!GetStatusInfo(pid, process_info, State)) return false; return true; @@ -270,24 +175,10 @@ uint32_t Host::FindProcessesImpl(const ProcessInstanceInfoMatch &match_info, } bool Host::GetProcessInfo(lldb::pid_t pid, ProcessInstanceInfo &process_info) { - ::pid_t tracerpid; ProcessState State; - return GetProcessAndStatInfo(pid, process_info, State, tracerpid); + return GetProcessAndStatInfo(pid, process_info, State); } -Environment Host::GetEnvironment() { return Environment(environ); } - Status Host::ShellExpandArguments(ProcessLaunchInfo &launch_info) { return Status("unimplemented"); } - -std::optional lldb_private::getPIDForTID(lldb::pid_t tid) { - ::pid_t tracerpid, tgid = LLDB_INVALID_PROCESS_ID; - ProcessInstanceInfo process_info; - ProcessState state; - - if (!GetStatusInfo(tid, process_info, state, tracerpid, tgid) || - tgid == LLDB_INVALID_PROCESS_ID) - return std::nullopt; - return tgid; -} From c12d50c4d2e63cc2a21ed18da1f794e1e9dbce17 Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Mon, 9 Jun 2025 21:28:58 +0530 Subject: [PATCH 54/58] Re-Design for ObjectFileXCOFF - 64/32 bit (#72) This is the New design for the ObjectFileXCOFF using LLVM interfaces. The intent for which I had to implement this is that we don't need to create new structures and can use LLVM interfaces instead, which makes the code cleaner, safer and more structured. And community was not accepting the older design. As a plus, I have also made sure that it is able to handle 32 bit binaries too along with 64 bit I have discarded the older design and only kept whats necessary, without changing any existing behaviour. Most of this code has already been reviewed and merged in the community, so I am pulling it together again here in our fork branch. 
--- .../ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 516 +++++------------- .../ObjectFile/XCOFF/ObjectFileXCOFF.h | 132 +---- 2 files changed, 167 insertions(+), 481 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index 4d363915ddacd..349b3a9d5e619 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -49,9 +49,6 @@ using namespace lldb; using namespace lldb_private; LLDB_PLUGIN_DEFINE(ObjectFileXCOFF) - -char ObjectFileXCOFF::ID; - // FIXME: target 64bit at this moment. // Static methods. @@ -98,7 +95,6 @@ ObjectFile *ObjectFileXCOFF::CreateInstance(const lldb::ModuleSP &module_sp, return nullptr; if (!objfile_up->ParseHeader()) - //FIXME objfile leak return nullptr; UGLY_FLAG_FOR_AIX = true; @@ -111,16 +107,17 @@ bool ObjectFileXCOFF::CreateBinary() { Log *log = GetLog(LLDBLog::Object); - auto binary = llvm::object::ObjectFile::createObjectFile( - llvm::MemoryBufferRef(toStringRef(m_data.GetData()), - m_file.GetFilename().GetStringRef()), - file_magic::xcoff_object_64); + auto memory_ref = llvm::MemoryBufferRef(toStringRef(m_data.GetData()), + m_file.GetFilename().GetStringRef()); + llvm::file_magic magic = llvm::identify_magic(memory_ref.getBuffer()); + + auto binary = llvm::object::ObjectFile::createObjectFile(memory_ref, magic); if (!binary) { LLDB_LOG_ERROR(log, binary.takeError(), "Failed to create binary for file ({1}): {0}", m_file); return false; } - // Make sure we only handle COFF format. + // Make sure we only handle XCOFF format. m_binary = llvm::unique_dyn_cast(std::move(*binary)); if (!m_binary) @@ -159,9 +156,9 @@ size_t ObjectFileXCOFF::GetModuleSpecifications( static uint32_t XCOFFHeaderSizeFromMagic(uint32_t magic) { switch (magic) { - // TODO: 32bit not supported. 
- // case XCOFF::XCOFF32: - // return sizeof(struct llvm::object::XCOFFFileHeader32); + case XCOFF::XCOFF32: + return sizeof(struct llvm::object::XCOFFFileHeader32); + break; case XCOFF::XCOFF64: return sizeof(struct llvm::object::XCOFFFileHeader64); break; @@ -185,137 +182,9 @@ bool ObjectFileXCOFF::MagicBytesMatch(DataBufferSP &data_sp, } bool ObjectFileXCOFF::ParseHeader() { - ModuleSP module_sp(GetModule()); - if (module_sp) { - std::lock_guard guard(module_sp->GetMutex()); - m_sect_headers.clear(); - lldb::offset_t offset = 0; - - if (ParseXCOFFHeader(m_data, &offset, m_xcoff_header)) { - m_data.SetAddressByteSize(GetAddressByteSize()); - if (m_xcoff_header.auxhdrsize > 0) - ParseXCOFFOptionalHeader(m_data, &offset); - ParseSectionHeaders(offset); - } - return true; - } - - return false; -} - -bool ObjectFileXCOFF::ParseXCOFFHeader(lldb_private::DataExtractor &data, - lldb::offset_t *offset_ptr, - xcoff_header_t &xcoff_header) { - //FIXME: data.ValidOffsetForDataOfSize - xcoff_header.magic = data.GetU16(offset_ptr); - xcoff_header.nsects = data.GetU16(offset_ptr); - xcoff_header.modtime = data.GetU32(offset_ptr); - xcoff_header.symoff = data.GetU64(offset_ptr); - xcoff_header.auxhdrsize = data.GetU16(offset_ptr); - xcoff_header.flags = data.GetU16(offset_ptr); - xcoff_header.nsyms = data.GetU32(offset_ptr); - return true; -} - -bool ObjectFileXCOFF::ParseXCOFFOptionalHeader(lldb_private::DataExtractor &data, - lldb::offset_t *offset_ptr) { - lldb::offset_t init_offset = *offset_ptr; - //FIXME: data.ValidOffsetForDataOfSize - m_xcoff_aux_header.AuxMagic = data.GetU16(offset_ptr); - m_xcoff_aux_header.Version = data.GetU16(offset_ptr); - m_xcoff_aux_header.ReservedForDebugger = data.GetU32(offset_ptr); - m_xcoff_aux_header.TextStartAddr = data.GetU64(offset_ptr); - m_xcoff_aux_header.DataStartAddr = data.GetU64(offset_ptr); - m_xcoff_aux_header.TOCAnchorAddr = data.GetU64(offset_ptr); - m_xcoff_aux_header.SecNumOfEntryPoint = data.GetU16(offset_ptr); - 
m_xcoff_aux_header.SecNumOfText = data.GetU16(offset_ptr); - m_xcoff_aux_header.SecNumOfData = data.GetU16(offset_ptr); - m_xcoff_aux_header.SecNumOfTOC = data.GetU16(offset_ptr); - m_xcoff_aux_header.SecNumOfLoader = data.GetU16(offset_ptr); - m_xcoff_aux_header.SecNumOfBSS = data.GetU16(offset_ptr); - m_xcoff_aux_header.MaxAlignOfText = data.GetU16(offset_ptr); - m_xcoff_aux_header.MaxAlignOfData = data.GetU16(offset_ptr); - m_xcoff_aux_header.ModuleType = data.GetU16(offset_ptr); - m_xcoff_aux_header.CpuFlag = data.GetU8(offset_ptr); - m_xcoff_aux_header.CpuType = data.GetU8(offset_ptr); - m_xcoff_aux_header.TextPageSize = data.GetU8(offset_ptr); - m_xcoff_aux_header.DataPageSize = data.GetU8(offset_ptr); - m_xcoff_aux_header.StackPageSize = data.GetU8(offset_ptr); - m_xcoff_aux_header.FlagAndTDataAlignment = data.GetU8(offset_ptr); - m_xcoff_aux_header.TextSize = data.GetU64(offset_ptr); - m_xcoff_aux_header.InitDataSize = data.GetU64(offset_ptr); - m_xcoff_aux_header.BssDataSize = data.GetU64(offset_ptr); - m_xcoff_aux_header.EntryPointAddr = data.GetU64(offset_ptr); - m_xcoff_aux_header.MaxStackSize = data.GetU64(offset_ptr); - m_xcoff_aux_header.MaxDataSize = data.GetU64(offset_ptr); - m_xcoff_aux_header.SecNumOfTData = data.GetU16(offset_ptr); - m_xcoff_aux_header.SecNumOfTBSS = data.GetU16(offset_ptr); - m_xcoff_aux_header.XCOFF64Flag = data.GetU16(offset_ptr); - lldb::offset_t last_offset = *offset_ptr; - if ((last_offset - init_offset) < m_xcoff_header.auxhdrsize) - *offset_ptr += (m_xcoff_header.auxhdrsize - (last_offset - init_offset)); - return true; -} - -bool ObjectFileXCOFF::ParseSectionHeaders( - uint32_t section_header_data_offset) { - const uint32_t nsects = m_xcoff_header.nsects; - m_sect_headers.clear(); - - if (nsects > 0) { - const size_t section_header_byte_size = nsects * m_binary->getSectionHeaderSize(); - lldb_private::DataExtractor section_header_data = - ReadImageData(section_header_data_offset, section_header_byte_size); - - 
lldb::offset_t offset = 0; - //FIXME: section_header_data.ValidOffsetForDataOfSize - m_sect_headers.resize(nsects); - - for (uint32_t idx = 0; idx < nsects; ++idx) { - const void *name_data = section_header_data.GetData(&offset, 8); - if (name_data) { - memcpy(m_sect_headers[idx].name, name_data, 8); - m_sect_headers[idx].phyaddr = section_header_data.GetU64(&offset); - m_sect_headers[idx].vmaddr = section_header_data.GetU64(&offset); - m_sect_headers[idx].size = section_header_data.GetU64(&offset); - m_sect_headers[idx].offset = section_header_data.GetU64(&offset); - m_sect_headers[idx].reloff = section_header_data.GetU64(&offset); - m_sect_headers[idx].lineoff = section_header_data.GetU64(&offset); - m_sect_headers[idx].nreloc = section_header_data.GetU32(&offset); - m_sect_headers[idx].nline = section_header_data.GetU32(&offset); - m_sect_headers[idx].flags = section_header_data.GetU32(&offset); - offset += 4; - } else { - offset += (m_binary->getSectionHeaderSize() - 8); - } - } - } - - return !m_sect_headers.empty(); -} - -lldb_private::DataExtractor ObjectFileXCOFF::ReadImageData(uint32_t offset, size_t size) { - if (!size) - return {}; - - if (m_data.ValidOffsetForDataOfSize(offset, size)) - return lldb_private::DataExtractor(m_data, offset, size); - - assert(0); - ProcessSP process_sp(m_process_wp.lock()); - lldb_private::DataExtractor data; - if (process_sp) { - auto data_up = std::make_unique(size, 0); - Status readmem_error; - size_t bytes_read = - process_sp->ReadMemory(offset, data_up->GetBytes(), - data_up->GetByteSize(), readmem_error); - if (bytes_read == size) { - DataBufferSP buffer_sp(data_up.release()); - data.SetData(buffer_sp, 0, buffer_sp->GetByteSize()); - } - } - return data; + if (m_binary->is64Bit()) + return m_binary->fileHeader64()->Magic == XCOFF::XCOFF64; + return m_binary->fileHeader32()->Magic == XCOFF::XCOFF32; } bool ObjectFileXCOFF::SetLoadAddress(Target &target, lldb::addr_t value, @@ -397,10 +266,8 @@ ByteOrder 
ObjectFileXCOFF::GetByteOrder() const { return eByteOrderBig; } bool ObjectFileXCOFF::IsExecutable() const { return true; } uint32_t ObjectFileXCOFF::GetAddressByteSize() const { - if (m_xcoff_header.magic == XCOFF::XCOFF64) + if (m_binary->is64Bit()) return 8; - else if (m_xcoff_header.magic == XCOFF::XCOFF32) - return 4; return 4; } @@ -408,190 +275,147 @@ AddressClass ObjectFileXCOFF::GetAddressClass(addr_t file_addr) { return AddressClass::eUnknown; } -lldb::SymbolType ObjectFileXCOFF::MapSymbolType(llvm::object::SymbolRef::Type sym_type) { +lldb::SymbolType MapSymbolType(llvm::object::SymbolRef::Type sym_type) { if (sym_type == llvm::object::SymbolRef::ST_Function) return lldb::eSymbolTypeCode; else if (sym_type == llvm::object::SymbolRef::ST_Data) return lldb::eSymbolTypeData; + else if (sym_type == llvm::object::SymbolRef::ST_File) + return lldb::eSymbolTypeSourceFile; return lldb::eSymbolTypeInvalid; } void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) { - SectionList *sect_list = GetSectionList(); - const uint32_t num_syms = m_xcoff_header.nsyms; - uint32_t sidx = 0; - if (num_syms > 0 && m_xcoff_header.symoff > 0) { - const uint32_t symbol_size = XCOFF::SymbolTableEntrySize; - const size_t symbol_data_size = num_syms * symbol_size; - lldb_private::DataExtractor symtab_data = - ReadImageData(m_xcoff_header.symoff, symbol_data_size); - - lldb::offset_t offset = 0; - std::string symbol_name; - Symbol *symbols = lldb_symtab.Resize(num_syms); - llvm::object::symbol_iterator SI = m_binary->symbol_begin(); - for (uint32_t i = 0; i < num_syms; ++i, ++SI) { - xcoff_symbol_t symbol; - const uint32_t symbol_offset = offset; - symbol.value = symtab_data.GetU64(&offset); - symbol.offset = symtab_data.GetU32(&offset); - Expected symbol_name_or_err = m_binary->getStringTableEntry(symbol.offset); - if (!symbol_name_or_err) { - consumeError(symbol_name_or_err.takeError()); - return; - } - StringRef symbol_name_str = symbol_name_or_err.get(); - 
symbol_name.assign(symbol_name_str.data()); - symbol.sect = symtab_data.GetU16(&offset); - symbol.type = symtab_data.GetU16(&offset); - symbol.storage = symtab_data.GetU8(&offset); - symbol.naux = symtab_data.GetU8(&offset); - // Allow C_HIDEXT TOC symbol, and check others. - if (symbol.storage == XCOFF::C_HIDEXT && strcmp(symbol_name.c_str(), "TOC") != 0) { - if (symbol.naux == 0) - continue; - if (symbol.naux > 1) { - i += symbol.naux; - offset += symbol.naux * symbol_size; - continue; - } - /* Allow XCOFF::C_HIDEXT with following SMC and AT: - StorageMappingClass: XMC_PR (0x0) - Auxiliary Type: AUX_CSECT (0xFB) - */ - xcoff_sym_csect_aux_entry_t symbol_aux; - symbol_aux.section_or_len_low_byte = symtab_data.GetU32(&offset); - symbol_aux.parameter_hash_index = symtab_data.GetU32(&offset); - symbol_aux.type_check_sect_num = symtab_data.GetU16(&offset); - symbol_aux.symbol_alignment_and_type = symtab_data.GetU8(&offset); - symbol_aux.storage_mapping_class = symtab_data.GetU8(&offset); - symbol_aux.section_or_len_high_byte = symtab_data.GetU32(&offset); - symbol_aux.pad = symtab_data.GetU8(&offset); - symbol_aux.aux_type = symtab_data.GetU8(&offset); - offset -= symbol.naux * symbol_size; - if (symbol_aux.storage_mapping_class != XCOFF::XMC_PR || symbol_aux.aux_type != XCOFF::AUX_CSECT) { - i += symbol.naux; - offset += symbol.naux * symbol_size; - continue; - } - } - // Remove the dot prefix for demangle - if (symbol_name_str.size() > 1 && symbol_name_str.data()[0] == '.') { - symbols[sidx].GetMangled().SetValue(ConstString(symbol_name.c_str() + 1)); - } else { - symbols[sidx].GetMangled().SetValue(ConstString(symbol_name.c_str())); - } - if ((int16_t)symbol.sect >= 1) { - Address symbol_addr(sect_list->GetSectionAtIndex((size_t)(symbol.sect - 1)), - (symbol.value - sect_list->GetSectionAtIndex((size_t)(symbol.sect - 1))->GetFileAddress())); - symbols[sidx].GetAddressRef() = symbol_addr; - - Expected sym_type_or_err = SI->getType(); - if (!sym_type_or_err) { - 
consumeError(sym_type_or_err.takeError()); - return; - } - symbols[sidx].SetType(MapSymbolType(sym_type_or_err.get())); + SectionList *sectionList = GetSectionList(); + + for (const auto &symbol_ref : m_binary->symbols()) { + llvm::object::XCOFFSymbolRef xcoff_sym_ref(symbol_ref); + llvm::Expected name_or_err = xcoff_sym_ref.getName(); + if (!name_or_err) { + consumeError(name_or_err.takeError()); + continue; + } + llvm::StringRef symbolName = name_or_err.get(); + // Remove the dot prefix for demangle + llvm::StringRef symbol_name = + symbolName.starts_with(".") ? symbolName.drop_front() : symbolName; + auto storageClass = xcoff_sym_ref.getStorageClass(); + if (storageClass == XCOFF::C_HIDEXT && symbolName != "TOC") { + if (xcoff_sym_ref.getNumberOfAuxEntries() != 1) + continue; + auto aux_csect_or_err = xcoff_sym_ref.getXCOFFCsectAuxRef(); + if (!aux_csect_or_err) { + consumeError(aux_csect_or_err.takeError()); + continue; } - ++sidx; + const llvm::object::XCOFFCsectAuxRef csect_aux = aux_csect_or_err.get(); + if (csect_aux.getStorageMappingClass() != XCOFF::XMC_PR || + (m_binary->is64Bit() ? 
(csect_aux.getAuxType64() != XCOFF::AUX_CSECT) + : false)) + continue; + } - if (symbol.naux > 0) { - i += symbol.naux; - offset += symbol.naux * symbol_size; - } + Symbol symbol; + symbol.GetMangled().SetValue(ConstString(symbol_name)); + + int16_t sectionNumber = xcoff_sym_ref.getSectionNumber(); + size_t sectionIndex = static_cast(sectionNumber - 1); + if (sectionNumber > 0 && sectionIndex < sectionList->GetSize()) { + lldb::SectionSP section_sp = + sectionList->GetSectionAtIndex(sectionNumber - 1); + if (!section_sp || section_sp->GetFileAddress() == LLDB_INVALID_ADDRESS) + continue; + lldb::addr_t file_addr = section_sp->GetFileAddress(); + lldb::addr_t symbolValue = xcoff_sym_ref.getValue(); + if (symbolValue < file_addr) + continue; + symbol.GetAddressRef() = Address(section_sp, symbolValue - file_addr); + } + + Expected sym_type_or_err = + symbol_ref.getType(); + if (!sym_type_or_err) { + consumeError(sym_type_or_err.takeError()); + continue; } - lldb_symtab.Resize(sidx); + + symbol.SetType(MapSymbolType(sym_type_or_err.get())); + + lldb_symtab.AddSymbol(symbol); } } -bool ObjectFileXCOFF::IsStripped() { - return false; -} +bool ObjectFileXCOFF::IsStripped() { return false; } void ObjectFileXCOFF::CreateSections(SectionList &unified_section_list) { + if (m_sections_up) return; + m_sections_up = std::make_unique(); - ModuleSP module_sp(GetModule()); - if (module_sp) { - std::lock_guard guard(module_sp->GetMutex()); - - const uint32_t nsects = m_sect_headers.size(); - ModuleSP module_sp(GetModule()); - for (uint32_t idx = 0; idx < nsects; ++idx) { - llvm::StringRef sect_name = GetSectionName(m_sect_headers[idx]); - ConstString const_sect_name(sect_name); - SectionType section_type = GetSectionType(sect_name, m_sect_headers[idx]); - - SectionSP section_sp(new Section( - module_sp, // Module to which this section belongs - this, // Object file to which this section belongs - idx + 1, // Section ID is the 1 based section index. 
- const_sect_name, // Name of this section - section_type, - m_sect_headers[idx].vmaddr, // File VM address == addresses as - // they are found in the object file - m_sect_headers[idx].size, // VM size in bytes of this section - m_sect_headers[idx].offset, // Offset to the data for this section in the file - m_sect_headers[idx].size, // Size in bytes of this section as found in the file - 0, // FIXME: alignment - m_sect_headers[idx].flags)); // Flags for this section - - // FIXME - uint32_t permissions = 0; - permissions |= ePermissionsReadable; - if (m_sect_headers[idx].flags & (XCOFF::STYP_DATA | XCOFF::STYP_BSS)) - permissions |= ePermissionsWritable; - if (m_sect_headers[idx].flags & XCOFF::STYP_TEXT) - permissions |= ePermissionsExecutable; - section_sp->SetPermissions(permissions); - - m_sections_up->AddSection(section_sp); - unified_section_list.AddSection(section_sp); - } - } + if (m_binary->is64Bit()) + CreateSectionsWithBitness(unified_section_list); + else + CreateSectionsWithBitness(unified_section_list); } -llvm::StringRef ObjectFileXCOFF::GetSectionName(const section_header_t §) { - llvm::StringRef hdr_name(sect.name, std::size(sect.name)); - hdr_name = hdr_name.split('\0').first; - if (hdr_name.consume_front("/")) { - lldb::offset_t stroff; - if (!to_integer(hdr_name, stroff, 10)) - return ""; - lldb::offset_t string_file_offset = - m_xcoff_header.symoff + (m_xcoff_header.nsyms * static_cast(XCOFF::SymbolTableEntrySize)) + stroff; - if (const char *name = m_data.GetCStr(&string_file_offset)) - return name; - return ""; - } - return hdr_name; +template +static auto GetSections(llvm::object::XCOFFObjectFile *binary) { + if constexpr (T::Is64Bit) + return binary->sections64(); + else + return binary->sections32(); } -SectionType ObjectFileXCOFF::GetSectionType(llvm::StringRef sect_name, - const section_header_t §) { - if (sect.flags & XCOFF::STYP_TEXT) - return eSectionTypeCode; - if (sect.flags & XCOFF::STYP_DATA) - return eSectionTypeData; - if 
(sect.flags & XCOFF::STYP_BSS) - return eSectionTypeZeroFill; - if (sect.flags & XCOFF::STYP_DWARF) { - SectionType section_type = - llvm::StringSwitch(sect_name) - .Case(".dwinfo", eSectionTypeDWARFDebugInfo) - .Case(".dwline", eSectionTypeDWARFDebugLine) - .Case(".dwabrev", eSectionTypeDWARFDebugAbbrev) - .Default(eSectionTypeInvalid); - - if (section_type != eSectionTypeInvalid) - return section_type; +template +void ObjectFileXCOFF::CreateSectionsWithBitness( + SectionList &unified_section_list) { + ModuleSP module_sp(GetModule()); + if (!module_sp) + return; + + std::lock_guard guard(module_sp->GetMutex()); + + int idx = 0; + for (const typename T::SectionHeader §ion : + GetSections(m_binary.get())) { + + ConstString const_sect_name(section.Name); + + SectionType section_type = lldb::eSectionTypeOther; + if (section.Flags & XCOFF::STYP_TEXT) + section_type = eSectionTypeCode; + else if (section.Flags & XCOFF::STYP_DATA) + section_type = eSectionTypeData; + else if (section.Flags & XCOFF::STYP_BSS) + section_type = eSectionTypeZeroFill; + else if (section.Flags & XCOFF::STYP_DWARF) { + section_type = llvm::StringSwitch(section.Name) + .Case(".dwinfo", eSectionTypeDWARFDebugInfo) + .Case(".dwline", eSectionTypeDWARFDebugLine) + .Case(".dwabrev", eSectionTypeDWARFDebugAbbrev) + .Case(".dwrnges", eSectionTypeDWARFDebugRanges) + .Default(eSectionTypeInvalid); + } + + SectionSP section_sp(new Section( + module_sp, this, ++idx, const_sect_name, section_type, + section.VirtualAddress, section.SectionSize, + section.FileOffsetToRawData, section.SectionSize, 0, section.Flags)); + + uint32_t permissions = ePermissionsReadable; + if (section.Flags & (XCOFF::STYP_DATA | XCOFF::STYP_BSS)) + permissions |= ePermissionsWritable; + if (section.Flags & XCOFF::STYP_TEXT) + permissions |= ePermissionsExecutable; + + section_sp->SetPermissions(permissions); + m_sections_up->AddSection(section_sp); + unified_section_list.AddSection(section_sp); } - return eSectionTypeOther; } -void 
ObjectFileXCOFF::Dump(Stream *s) { -} +void ObjectFileXCOFF::Dump(Stream *s) {} ArchSpec ObjectFileXCOFF::GetArchitecture() { ArchSpec arch_spec = @@ -630,54 +454,6 @@ uint32_t ObjectFileXCOFF::ParseDependentModules() { consumeError(ImportFilesOrError.takeError()); return 0; } - -#if 0 - StringRef ImportFileTable = ImportFilesOrError.get(); - const char *CurrentStr = ImportFileTable.data(); - const char *TableEnd = ImportFileTable.end(); - const char *Basename = nullptr; - - for (size_t StrIndex = 0; CurrentStr < TableEnd; - ++StrIndex, CurrentStr += strlen(CurrentStr) + 1) { - if (StrIndex >= 3 && StrIndex % 3 == 1) { - // base_name - llvm::StringRef dll_name(CurrentStr); - Basename = CurrentStr; - - // At this moment we only have the base name of the DLL. The full path can - // only be seen after the dynamic loading. Our best guess is Try to get it - // with the help of the object file's directory. - llvm::SmallString<128> dll_fullpath; - FileSpec dll_specs(dll_name); - // FIXME: hack to get libc.a loaded - if (strcmp(CurrentStr, "libc.a") == 0) { - dll_specs.GetDirectory().SetString("/usr/lib"); - } else { - dll_specs.GetDirectory().SetString(m_file.GetDirectory().GetCString()); - } - - if (!llvm::sys::fs::real_path(dll_specs.GetPath(), dll_fullpath)) - //m_deps_filespec->EmplaceBack(dll_fullpath); - m_deps_filespec->EmplaceBack("/usr/lib/libc.a(shr_64.o)"); - else { - // Known DLLs or DLL not found in the object file directory. 
- m_deps_filespec->EmplaceBack(dll_name); - } - } else if (StrIndex >= 3 && StrIndex % 3 == 2) { - // archive_member_name - if (strcmp(CurrentStr, "") == 0) { - continue; - } - assert(strcmp(Basename, "") != 0); - std::map>::iterator iter = m_deps_base_members.find(std::string(Basename)); - if (iter == m_deps_base_members.end()) { - m_deps_base_members[std::string(Basename)] = std::vector(); - iter = m_deps_base_members.find(std::string(Basename)); - } - iter->second.push_back(std::string(CurrentStr)); - } - } -#endif return m_deps_filespec->GetSize(); } @@ -703,7 +479,8 @@ lldb_private::Address ObjectFileXCOFF::GetEntryPointAddress() { return m_entry_point_address; SectionList *section_list = GetSectionList(); - addr_t vm_addr = m_xcoff_aux_header.EntryPointAddr; + addr_t vm_addr = m_binary->is64Bit() ? m_binary->auxiliaryHeader64()->EntryPointAddr : + m_binary->auxiliaryHeader32()->EntryPointAddr; SectionSP section_sp( section_list->FindSectionContainingFileAddress(vm_addr)); if (section_sp) { @@ -726,9 +503,13 @@ lldb_private::Address ObjectFileXCOFF::GetBaseAddress() { } ObjectFile::Type ObjectFileXCOFF::CalculateType() { - if (m_binary->fileHeader64()->Flags & XCOFF::F_EXEC) + + const auto flags = m_binary->is64Bit() ? 
m_binary->fileHeader64()->Flags + : m_binary->fileHeader32()->Flags; + + if (flags & XCOFF::F_EXEC) return eTypeExecutable; - else if (m_binary->fileHeader64()->Flags & XCOFF::F_SHROBJ) + else if (flags & XCOFF::F_SHROBJ) return eTypeSharedLibrary; return eTypeUnknown; } @@ -758,23 +539,18 @@ ObjectFileXCOFF::MapFileDataWritable(const FileSpec &file, uint64_t Size, } ObjectFileXCOFF::ObjectFileXCOFF(const lldb::ModuleSP &module_sp, - DataBufferSP data_sp, lldb::offset_t data_offset, - const FileSpec *file, lldb::offset_t file_offset, - lldb::offset_t length) - : ObjectFile(module_sp, file, file_offset, length, data_sp, data_offset), - m_xcoff_header(), m_sect_headers(), m_deps_filespec(), m_deps_base_members(), - m_entry_point_address() { - ::memset(&m_xcoff_header, 0, sizeof(m_xcoff_header)); + DataBufferSP data_sp, + lldb::offset_t data_offset, + const FileSpec *file, + lldb::offset_t file_offset, + lldb::offset_t length) + : ObjectFile(module_sp, file, file_offset, length, data_sp, data_offset) { if (file) m_file = *file; } ObjectFileXCOFF::ObjectFileXCOFF(const lldb::ModuleSP &module_sp, - DataBufferSP header_data_sp, - const lldb::ProcessSP &process_sp, - addr_t header_addr) - : ObjectFile(module_sp, process_sp, header_addr, header_data_sp), - m_xcoff_header(), m_sect_headers(), m_deps_filespec(), m_deps_base_members(), - m_entry_point_address() { - ::memset(&m_xcoff_header, 0, sizeof(m_xcoff_header)); -} + DataBufferSP header_data_sp, + const lldb::ProcessSP &process_sp, + addr_t header_addr) + : ObjectFile(module_sp, process_sp, header_addr, header_data_sp) {} diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h index f827fca3932f4..ce4bcaa16df5a 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h @@ -56,18 +56,9 @@ class ObjectFileXCOFF : public lldb_private::ObjectFile { static bool 
MagicBytesMatch(lldb::DataBufferSP &data_sp, lldb::addr_t offset, lldb::addr_t length); - static lldb::SymbolType MapSymbolType(llvm::object::SymbolRef::Type sym_type); - // PluginInterface protocol llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } - // LLVM RTTI support - static char ID; - bool isA(const void *ClassID) const override { - return ClassID == &ID || ObjectFile::isA(ClassID); - } - static bool classof(const ObjectFile *obj) { return obj->isA(&ID); } - // ObjectFile Protocol. bool ParseHeader() override; @@ -113,126 +104,45 @@ class ObjectFileXCOFF : public lldb_private::ObjectFile { ObjectFile::Type CalculateType() override; ObjectFile::Strata CalculateStrata() override; - + llvm::StringRef StripLinkerSymbolAnnotations(llvm::StringRef symbol_name) const override; void RelocateSection(lldb_private::Section *section) override; - lldb_private::DataExtractor ReadImageData(uint32_t offset, size_t size); - ObjectFileXCOFF(const lldb::ModuleSP &module_sp, lldb::DataBufferSP data_sp, - lldb::offset_t data_offset, const lldb_private::FileSpec *file, - lldb::offset_t offset, lldb::offset_t length); + lldb::offset_t data_offset, + const lldb_private::FileSpec *file, lldb::offset_t offset, + lldb::offset_t length); ObjectFileXCOFF(const lldb::ModuleSP &module_sp, - lldb::DataBufferSP header_data_sp, - const lldb::ProcessSP &process_sp, lldb::addr_t header_addr); + lldb::DataBufferSP header_data_sp, + const lldb::ProcessSP &process_sp, lldb::addr_t header_addr); protected: - - typedef struct xcoff_header { - uint16_t magic; - uint16_t nsects; - uint32_t modtime; - uint64_t symoff; - uint32_t nsyms; - uint16_t auxhdrsize; - uint16_t flags; - } xcoff_header_t; - - typedef struct xcoff_aux_header { - uint16_t AuxMagic; - uint16_t Version; - uint32_t ReservedForDebugger; - uint64_t TextStartAddr; - uint64_t DataStartAddr; - uint64_t TOCAnchorAddr; - uint16_t SecNumOfEntryPoint; - uint16_t SecNumOfText; - uint16_t SecNumOfData; - uint16_t 
SecNumOfTOC; - uint16_t SecNumOfLoader; - uint16_t SecNumOfBSS; - uint16_t MaxAlignOfText; - uint16_t MaxAlignOfData; - uint16_t ModuleType; - uint8_t CpuFlag; - uint8_t CpuType; - uint8_t TextPageSize; - uint8_t DataPageSize; - uint8_t StackPageSize; - uint8_t FlagAndTDataAlignment; - uint64_t TextSize; - uint64_t InitDataSize; - uint64_t BssDataSize; - uint64_t EntryPointAddr; - uint64_t MaxStackSize; - uint64_t MaxDataSize; - uint16_t SecNumOfTData; - uint16_t SecNumOfTBSS; - uint16_t XCOFF64Flag; - } xcoff_aux_header_t; - - typedef struct section_header { - char name[8]; - uint64_t phyaddr; // Physical Addr - uint64_t vmaddr; // Virtual Addr - uint64_t size; // Section size - uint64_t offset; // File offset to raw data - uint64_t reloff; // Offset to relocations - uint64_t lineoff; // Offset to line table entries - uint32_t nreloc; // Number of relocation entries - uint32_t nline; // Number of line table entries - uint32_t flags; - } section_header_t; - - typedef struct xcoff_symbol { - uint64_t value; - uint32_t offset; - uint16_t sect; - uint16_t type; - uint8_t storage; - uint8_t naux; - } xcoff_symbol_t; - - typedef struct xcoff_sym_csect_aux_entry { - uint32_t section_or_len_low_byte; - uint32_t parameter_hash_index; - uint16_t type_check_sect_num; - uint8_t symbol_alignment_and_type; - uint8_t storage_mapping_class; - uint32_t section_or_len_high_byte; - uint8_t pad; - uint8_t aux_type; - } xcoff_sym_csect_aux_entry_t; - - static bool ParseXCOFFHeader(lldb_private::DataExtractor &data, - lldb::offset_t *offset_ptr, - xcoff_header_t &xcoff_header); - bool ParseXCOFFOptionalHeader(lldb_private::DataExtractor &data, - lldb::offset_t *offset_ptr); - bool ParseSectionHeaders(uint32_t offset); - - std::vector - GetLoadableData(lldb_private::Target &target) override; - static lldb::WritableDataBufferSP MapFileDataWritable(const lldb_private::FileSpec &file, uint64_t Size, uint64_t Offset); - llvm::StringRef GetSectionName(const section_header_t §); - static 
lldb::SectionType GetSectionType(llvm::StringRef sect_name, - const section_header_t §); - + std::vector + GetLoadableData(lldb_private::Target &target) override; uint32_t ParseDependentModules(); - typedef std::vector SectionHeaderColl; + private: bool CreateBinary(); + template + void + CreateSectionsWithBitness(lldb_private::SectionList &unified_section_list); + + struct XCOFF32 { + using SectionHeader = llvm::object::XCOFFSectionHeader32; + static constexpr bool Is64Bit = false; + }; + struct XCOFF64 { + using SectionHeader = llvm::object::XCOFFSectionHeader64; + static constexpr bool Is64Bit = true; + }; - xcoff_header_t m_xcoff_header; - xcoff_aux_header_t m_xcoff_aux_header; - SectionHeaderColl m_sect_headers; std::unique_ptr m_binary; lldb_private::Address m_entry_point_address; std::optional m_deps_filespec; From 0b6893565d6d36a1cae68d8f283e9f6a037c8652 Mon Sep 17 00:00:00 2001 From: DhruvSrivastavaX Date: Fri, 13 Jun 2025 01:55:00 -0500 Subject: [PATCH 55/58] Merge typo --- lldb/source/Target/RegisterContextUnwind.cpp | 57 -------------------- 1 file changed, 57 deletions(-) diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp index f6e3b568dfe4d..85016ed417d0c 100644 --- a/lldb/source/Target/RegisterContextUnwind.cpp +++ b/lldb/source/Target/RegisterContextUnwind.cpp @@ -1520,65 +1520,8 @@ RegisterContextUnwind::SavedLocationForRegister( ExecutionContext exe_ctx(m_thread.shared_from_this()); Process *process = exe_ctx.GetProcessPtr(); -<<<<<<< HEAD - if (!have_unwindplan_regloc) { - // If the UnwindPlan failed to give us an unwind location for this - // register, we may be able to fall back to some ABI-defined default. For - // example, some ABIs allow to determine the caller's SP via the CFA. Also, - // the ABI may set volatile registers to the undefined state. - ABI *abi = process ? 
process->GetABI().get() : nullptr; - if (abi) { - const RegisterInfo *reg_info = - GetRegisterInfoAtIndex(regnum.GetAsKind(eRegisterKindLLDB)); - if (reg_info && - abi->GetFallbackRegisterLocation(reg_info, unwindplan_regloc)) { - UnwindLogMsg( - "supplying caller's saved %s (%d)'s location using ABI default", - regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB)); - have_unwindplan_regloc = true; - } - } - } - - if (!have_unwindplan_regloc) { - if (IsFrameZero()) { - // This is frame 0 - we should return the actual live register context - // value - lldb_private::UnwindLLDB::ConcreteRegisterLocation new_regloc; - new_regloc.type = - UnwindLLDB::ConcreteRegisterLocation::eRegisterInLiveRegisterContext; - new_regloc.location.register_number = regnum.GetAsKind(eRegisterKindLLDB); -#ifdef _AIX - if (UGLY_HACK_NULL_TOPFRAME && new_regloc.location.register_number == 0x20) { - new_regloc.location.register_number = 0x24; - } -#endif - m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = new_regloc; - regloc = new_regloc; - UnwindLogMsg("supplying caller's register %s (%d) from the live " - "RegisterContext at frame 0", - regnum.GetName(), regnum.GetAsKind(eRegisterKindLLDB)); - return UnwindLLDB::RegisterSearchResult::eRegisterFound; - } else { - std::string unwindplan_name; - if (m_full_unwind_plan_sp) { - unwindplan_name += "via '"; - unwindplan_name += m_full_unwind_plan_sp->GetSourceName().AsCString(); - unwindplan_name += "'"; - } - UnwindLogMsg("no save location for %s (%d) %s", regnum.GetName(), - regnum.GetAsKind(eRegisterKindLLDB), - unwindplan_name.c_str()); - } - return UnwindLLDB::RegisterSearchResult::eRegisterNotFound; - } - - // unwindplan_regloc has valid contents about where to retrieve the register - if (unwindplan_regloc.IsUnspecified()) { -======= // abs_regloc has valid contents about where to retrieve the register if (abs_regloc->IsUnspecified()) { ->>>>>>> upstream/main lldb_private::UnwindLLDB::ConcreteRegisterLocation new_regloc = {}; 
new_regloc.type = UnwindLLDB::ConcreteRegisterLocation::eRegisterNotSaved; m_registers[regnum.GetAsKind(eRegisterKindLLDB)] = new_regloc; From 9a621e65cfa3b19f460cffa83322f3e86fb6c505 Mon Sep 17 00:00:00 2001 From: DhruvSrivastavaX Date: Fri, 13 Jun 2025 07:38:38 -0500 Subject: [PATCH 56/58] Enabled Code formatting workflow --- .github/workflows/pr-code-format.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 05d69861e1841..5fcbee08789a9 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -7,6 +7,7 @@ on: pull_request: branches: - main + - gh-101657 - 'users/**' jobs: @@ -16,7 +17,7 @@ jobs: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} cancel-in-progress: true - if: github.repository == 'llvm/llvm-project' + if: github.repository == 'DhruvSrivastavaX/lldb-for-aix' steps: - name: Fetch LLVM sources uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 From c09f8d222e7cd29c09702d75e6091000de782e4d Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Wed, 18 Jun 2025 13:51:06 +0530 Subject: [PATCH 57/58] Merge conflict llvm-main Pull (#74) Merge remote-tracking branch 'upstream/main' into sync-upstream Conflicts: lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp ParseSymtab --- .github/new-prs-labeler.yml | 4 + bolt/include/bolt/Profile/DataAggregator.h | 60 +- bolt/lib/Profile/DataAggregator.cpp | 237 +- bolt/test/X86/callcont-fallthru.s | 72 +- bolt/test/lit.local.cfg | 9 +- .../clang-doc/Representation.cpp | 1 + clang-tools-extra/clang-doc/Serialize.cpp | 1 + .../UnnecessaryValueParamCheck.cpp | 18 +- .../performance/UnnecessaryValueParamCheck.h | 1 + clang-tools-extra/clangd/ClangdLSPServer.cpp | 10 +- clang-tools-extra/clangd/Protocol.cpp | 20 +- clang-tools-extra/clangd/Protocol.h | 5 +- .../clangd/test/positionencoding.test | 32 + 
clang-tools-extra/docs/ReleaseNotes.rst | 2 + .../checks/performance/enum-size.rst | 2 +- .../performance/unnecessary-value-param.rst | 9 +- .../unnecessary-value-param-coroutine.cpp | 65 + clang/docs/ReleaseNotes.rst | 11 +- clang/docs/Toolchain.rst | 5 + clang/docs/UsersManual.rst | 13 + clang/include/clang/AST/ASTContext.h | 39 + clang/include/clang/Basic/AttrDocs.td | 2 +- clang/include/clang/Basic/CodeGenOptions.def | 6 +- .../clang/Basic/DiagnosticDriverKinds.td | 3 + clang/include/clang/Basic/DiagnosticGroups.td | 19 +- clang/include/clang/CIR/Dialect/IR/CIROps.td | 2 + clang/include/clang/CIR/MissingFeatures.h | 1 + clang/include/clang/Driver/Options.td | 35 +- clang/include/clang/Parse/Parser.h | 2 +- clang/include/clang/Sema/Scope.h | 11 + clang/include/clang/Sema/SemaHLSL.h | 13 + clang/include/clang/Serialization/ASTWriter.h | 4 + .../clang/StaticAnalyzer/Checkers/Checkers.td | 53 +- clang/lib/AST/ASTContext.cpp | 67 + clang/lib/AST/ByteCode/InterpBlock.cpp | 14 +- clang/lib/AST/ByteCode/InterpStack.cpp | 19 +- clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 39 +- clang/lib/CIR/CodeGen/CIRGenCall.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 6 +- clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 30 + clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp | 239 +- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 10 +- clang/lib/CIR/CodeGen/CIRGenModule.h | 4 + clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 3 +- clang/lib/CIR/CodeGen/CIRGenValue.h | 27 +- clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 98 + .../Dialect/Transforms/CIRCanonicalize.cpp | 2 +- clang/lib/CodeGen/CGHLSLRuntime.cpp | 8 - clang/lib/CodeGen/CodeGenFunction.cpp | 1 - clang/lib/CodeGen/CodeGenModule.cpp | 11 +- clang/lib/Driver/Driver.cpp | 8 + clang/lib/Driver/ToolChains/BareMetal.cpp | 235 +- clang/lib/Driver/ToolChains/BareMetal.h | 19 +- clang/lib/Driver/ToolChains/Clang.cpp | 23 +- clang/lib/Driver/ToolChains/Fuchsia.cpp | 4 +- clang/lib/Driver/ToolChains/Gnu.cpp | 4 +- clang/lib/Headers/CMakeLists.txt | 1 + 
clang/lib/Headers/__clang_cuda_intrinsics.h | 284 + .../Headers/cuda_wrappers/bits/c++config.h | 51 + clang/lib/Parse/CMakeLists.txt | 1 + clang/lib/Parse/ParseDeclCXX.cpp | 36 +- clang/lib/Sema/SemaDecl.cpp | 40 +- clang/lib/Sema/SemaExpr.cpp | 1 + clang/lib/Sema/SemaHLSL.cpp | 33 +- clang/lib/Sema/SemaTypeTraits.cpp | 17 +- clang/lib/Serialization/ASTWriter.cpp | 5 +- .../Checkers/DivZeroChecker.cpp | 8 +- .../StaticAnalyzer/Checkers/MallocChecker.cpp | 22 + .../Checkers/NullabilityChecker.cpp | 177 +- .../Core/Z3CrosscheckVisitor.cpp | 2 +- .../test/Analysis/NewDelete-checker-test.cpp | 38 +- .../test/Analysis/analyzer-enabled-checkers.c | 1 - clang/test/Analysis/bugfix-124477.m | 2 +- ...c-library-functions-arg-enabled-checkers.c | 1 - clang/test/Analysis/taint-generic.c | 13 + clang/test/CIR/CodeGen/builtin_call.cpp | 18 + clang/test/CIR/CodeGen/builtin_printf.cpp | 65 + clang/test/CIR/CodeGen/complex.cpp | 29 + clang/test/CIR/CodeGen/string-literals.c | 12 + clang/test/CIR/CodeGen/string-literals.cpp | 23 + clang/test/CIR/Transforms/vector-cmp-fold.cir | 227 + .../aarch64-always-inline-feature-bug.c | 8 + clang/test/CodeGen/epilog-unwind.c | 10 +- clang/test/CodeGenHLSL/ArrayAssignable.hlsl | 30 +- clang/test/CodeGenHLSL/ArrayTemporary.hlsl | 12 +- .../BasicFeatures/ArrayOutputArguments.hlsl | 14 +- .../CodeGenHLSL/BasicFeatures/InitLists.hlsl | 36 +- .../BasicFeatures/OutputArguments.hlsl | 16 +- clang/test/CodeGenHLSL/Bool.hlsl | 2 +- clang/test/CodeGenHLSL/BoolVector.hlsl | 14 +- .../CodeGenHLSL/GlobalConstructorLib.hlsl | 2 +- clang/test/CodeGenHLSL/basic_types.hlsl | 64 +- .../test/CodeGenHLSL/builtins/AddUint64.hlsl | 4 +- .../ByteAddressBuffers-constructors.hlsl | 14 +- .../GroupMemoryBarrierWithGroupSync.hlsl | 8 +- .../builtins/RWBuffer-constructor.hlsl | 12 +- .../CodeGenHLSL/builtins/ScalarSwizzles.hlsl | 2 +- .../StructuredBuffers-constructors.hlsl | 12 +- clang/test/CodeGenHLSL/builtins/abs.hlsl | 56 +- clang/test/CodeGenHLSL/builtins/all.hlsl | 
8 +- clang/test/CodeGenHLSL/builtins/and.hlsl | 12 +- clang/test/CodeGenHLSL/builtins/any.hlsl | 8 +- .../CodeGenHLSL/builtins/ceil-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/ceil.hlsl | 24 +- .../CodeGenHLSL/builtins/clamp-overloads.hlsl | 8 +- clang/test/CodeGenHLSL/builtins/clamp.hlsl | 8 +- .../CodeGenHLSL/builtins/clip-builtin.hlsl | 2 +- clang/test/CodeGenHLSL/builtins/clip.hlsl | 8 +- .../CodeGenHLSL/builtins/cos-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/cos.hlsl | 24 +- clang/test/CodeGenHLSL/builtins/cross.hlsl | 8 +- .../builtins/degrees-overloads.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/degrees.hlsl | 8 +- clang/test/CodeGenHLSL/builtins/distance.hlsl | 32 +- .../CodeGenHLSL/builtins/exp-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/exp.hlsl | 24 +- .../CodeGenHLSL/builtins/exp2-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/exp2.hlsl | 24 +- .../CodeGenHLSL/builtins/firstbithigh.hlsl | 8 + .../CodeGenHLSL/builtins/firstbitlow.hlsl | 8 + .../CodeGenHLSL/builtins/floor-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/floor.hlsl | 24 +- clang/test/CodeGenHLSL/builtins/fmod.hlsl | 8 +- .../CodeGenHLSL/builtins/frac-overloads.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/frac.hlsl | 8 +- .../CodeGenHLSL/builtins/hlsl_resource_t.hlsl | 14 +- .../CodeGenHLSL/builtins/isinf-overloads.hlsl | 8 +- clang/test/CodeGenHLSL/builtins/isinf.hlsl | 16 +- clang/test/CodeGenHLSL/builtins/ldexp.hlsl | 16 +- clang/test/CodeGenHLSL/builtins/length.hlsl | 45 +- .../CodeGenHLSL/builtins/lerp-overloads.hlsl | 8 +- .../CodeGenHLSL/builtins/log-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/log.hlsl | 24 +- .../CodeGenHLSL/builtins/log10-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/log10.hlsl | 24 +- .../CodeGenHLSL/builtins/log2-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/log2.hlsl | 24 +- .../CodeGenHLSL/builtins/max-overloads.hlsl | 22 +- clang/test/CodeGenHLSL/builtins/max.hlsl | 80 +- 
.../CodeGenHLSL/builtins/min-overloads.hlsl | 22 +- clang/test/CodeGenHLSL/builtins/min.hlsl | 82 +- .../builtins/normalize-overloads.hlsl | 4 +- .../test/CodeGenHLSL/builtins/normalize.hlsl | 8 +- clang/test/CodeGenHLSL/builtins/or.hlsl | 14 +- .../CodeGenHLSL/builtins/pow-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/pow.hlsl | 24 +- .../builtins/radians-overloads.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/radians.hlsl | 8 +- clang/test/CodeGenHLSL/builtins/rcp.hlsl | 64 +- clang/test/CodeGenHLSL/builtins/reflect.hlsl | 32 +- .../CodeGenHLSL/builtins/reversebits.hlsl | 24 +- .../CodeGenHLSL/builtins/round-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/round.hlsl | 24 +- .../CodeGenHLSL/builtins/rsqrt-overloads.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/rsqrt.hlsl | 8 +- clang/test/CodeGenHLSL/builtins/sign.hlsl | 8 +- .../CodeGenHLSL/builtins/sin-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/sin.hlsl | 24 +- .../test/CodeGenHLSL/builtins/smoothstep.hlsl | 32 +- .../CodeGenHLSL/builtins/splitdouble.hlsl | 10 +- .../CodeGenHLSL/builtins/sqrt-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/sqrt.hlsl | 24 +- .../CodeGenHLSL/builtins/step-overloads.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/step.hlsl | 8 +- .../CodeGenHLSL/builtins/trunc-overloads.hlsl | 40 +- clang/test/CodeGenHLSL/builtins/trunc.hlsl | 24 +- .../wave_get_lane_index_do_while.hlsl | 2 +- .../builtins/wave_get_lane_index_simple.hlsl | 4 +- .../builtins/wave_get_lane_index_subcall.hlsl | 4 +- clang/test/CodeGenHLSL/cbuffer.hlsl | 92 +- .../CodeGenHLSL/cbuffer_and_namespaces.hlsl | 8 +- .../CodeGenHLSL/cbuffer_with_packoffset.hlsl | 10 +- ...uffer_with_static_global_and_function.hlsl | 2 +- .../CodeGenHLSL/convergence/do.while.hlsl | 10 +- clang/test/CodeGenHLSL/convergence/for.hlsl | 14 +- clang/test/CodeGenHLSL/convergence/while.hlsl | 12 +- clang/test/CodeGenHLSL/default_cbuffer.hlsl | 12 +- .../default_cbuffer_with_layout.hlsl | 12 +- 
clang/test/CodeGenHLSL/export.hlsl | 10 +- clang/test/CodeGenHLSL/group_shared.hlsl | 2 +- .../implicit-norecurse-attrib.hlsl | 11 +- clang/test/CodeGenHLSL/inline-functions.hlsl | 17 +- .../CodeGenHLSL/inline-spirv/SpirvType.hlsl | 4 +- clang/test/CodeGenHLSL/no_int_promotion.hlsl | 14 +- .../test/CodeGenHLSL/out-of-line-static.hlsl | 4 +- clang/test/CodeGenHLSL/shift-mask.hlsl | 16 +- .../CodeGenHLSL/this-assignment-overload.hlsl | 4 +- clang/test/CodeGenHLSL/vk-input-builtin.hlsl | 2 +- .../aarch64-none-elf/include/c++/8.2.1/.keep | 0 .../aarch64-none-elf/lib/.keep | 0 .../aarch64-none-elf/lib/crt0.o | 0 .../bin/aarch64-none-elf-ld | 1 + .../lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o | 0 .../lib/gcc/aarch64-none-elf/8.2.1/crtend.o | 0 .../aarch64-none-elf/lib/crt0.o | 0 .../aarch64-none-elf/lib/crtbegin.o | 0 .../aarch64-none-elf/lib/crtend.o | 0 .../bin/aarch64-none-elf-ld | 1 + .../armv6m-none-eabi/include/c++/8.2.1/.keep | 0 .../armv6m-none-eabi/lib/.keep | 0 .../armv6m-none-eabi/lib/crt0.o | 0 .../bin/armv6m-none-eabi-ld | 1 + .../lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o | 0 .../lib/gcc/armv6m-none-eabi/8.2.1/crtend.o | 0 .../armv6m-none-eabi/lib/crt0.o | 0 .../armv6m-none-eabi/lib/crtbegin.o | 0 .../armv6m-none-eabi/lib/crtend.o | 0 .../bin/armv6m-none-eabi-ld | 1 + clang/test/Driver/aarch64-gnutools.c | 4 + clang/test/Driver/aarch64-toolchain-extra.c | 28 + clang/test/Driver/aarch64-toolchain.c | 61 + clang/test/Driver/android-link.cpp | 12 + clang/test/Driver/arm-gnutools.c | 6 + clang/test/Driver/arm-toolchain-extra.c | 29 + clang/test/Driver/arm-toolchain.c | 62 + clang/test/Driver/baremetal.cpp | 16 + clang/test/Driver/check-no-multlib-warning.c | 10 + clang/test/Driver/cl-options.c | 6 +- clang/test/Driver/fveclib.c | 10 +- clang/test/Driver/ignored-pch.cpp | 19 + .../riscv-andes-a25.c | 7 +- .../riscv-andes-a45.c | 6 +- .../riscv-andes-ax25.c | 7 +- .../riscv-andes-ax45.c | 6 +- .../riscv-andes-n45.c | 6 +- .../riscv-andes-nx45.c | 6 +- 
.../Driver/print-supported-extensions-riscv.c | 9 +- .../Modules/preferred_name_header_unit.cpp | 64 + .../enable_16bit_types_validation_spirv.hlsl | 2 +- clang/test/PCH/Inputs/ignored-pch.h | 6 + clang/test/PCH/ignored-pch.c | 113 + ...-invalid-using-decl-in-constexpr-crash.cpp | 8 + .../riscv-target-features-andes.c | 8 + clang/test/Sema/gh87867.c | 33 + .../SemaCXX/builtin-is-constant-evaluated.cpp | 14 + clang/test/SemaCXX/class.cpp | 28 +- clang/test/SemaCXX/cxx0x-class.cpp | 11 +- clang/test/SemaCXX/cxx2a-consteval.cpp | 8 +- .../SemaCXX/cxx2c-trivially-relocatable.cpp | 1 + clang/test/SemaCXX/ptrauth-triviality.cpp | 44 +- .../SemaCXX/trivially-relocatable-ptrauth.cpp | 109 + .../SemaTemplate/instantiate-static-var.cpp | 10 +- .../ClangLinkerWrapper.cpp | 3 +- clang/unittests/Parse/CMakeLists.txt | 1 + clang/utils/TableGen/ClangAttrEmitter.cpp | 3 +- compiler-rt/lib/builtins/CMakeLists.txt | 6 +- compiler-rt/lib/msan/msan_interceptors.cpp | 2 +- .../lib/sanitizer_common/sanitizer_platform.h | 9 +- .../include/flang-rt/runtime/environment.h | 3 + flang-rt/include/flang-rt/runtime/stat.h | 10 +- flang-rt/include/flang-rt/runtime/type-info.h | 15 +- .../include/flang-rt/runtime/work-queue.h | 555 ++ flang-rt/lib/runtime/CMakeLists.txt | 2 + flang-rt/lib/runtime/allocatable.cpp | 20 + flang-rt/lib/runtime/assign.cpp | 663 +- flang-rt/lib/runtime/derived.cpp | 516 +- flang-rt/lib/runtime/descriptor-io.cpp | 668 +- flang-rt/lib/runtime/descriptor-io.h | 620 +- flang-rt/lib/runtime/environment.cpp | 4 + flang-rt/lib/runtime/namelist.cpp | 1 + flang-rt/lib/runtime/tools.cpp | 4 +- flang-rt/lib/runtime/type-info.cpp | 12 +- flang-rt/lib/runtime/work-queue.cpp | 161 + flang-rt/unittests/Runtime/ExternalIOTest.cpp | 2 +- flang/docs/Extensions.md | 10 + .../include/flang/Optimizer/Dialect/FIROps.td | 18 +- flang/include/flang/Parser/dump-parse-tree.h | 1 + flang/include/flang/Parser/parse-tree.h | 6 + flang/include/flang/Runtime/assign.h | 3 +- 
flang/include/flang/Semantics/tools.h | 7 +- flang/include/flang/Support/OpenMP-features.h | 3 + flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 50 +- flang/lib/Lower/OpenMP/Clauses.cpp | 4 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 44 +- flang/lib/Optimizer/Dialect/FIROps.cpp | 21 +- .../Transforms/ControlFlowConverter.cpp | 5 +- flang/lib/Parser/openmp-parsers.cpp | 2 + flang/lib/Semantics/check-allocate.cpp | 51 + flang/lib/Semantics/check-omp-structure.cpp | 9 + flang/lib/Semantics/mod-file.cpp | 44 +- flang/lib/Semantics/resolve-directives.cpp | 18 +- flang/lib/Semantics/resolve-names.cpp | 1 - flang/lib/Semantics/runtime-type-info.cpp | 88 +- flang/lib/Semantics/tools.cpp | 32 + flang/module/__fortran_type_info.f90 | 5 +- flang/test/CMakeLists.txt | 1 + .../Driver/flang-openmp-version-macro.f90 | 3 +- flang/test/Fir/cfg-conversion-if.fir | 46 + flang/test/Fir/fir-ops.fir | 16 + flang/test/Fir/invalid.fir | 28 + .../Lower/OpenMP/Todo/omp-clause-indirect.f90 | 34 + .../OpenMP/Todo/target-parallel-private.f90 | 13 - .../OpenMP/Todo/target-teams-private.f90 | 13 - flang/test/Lower/OpenMP/copyprivate5.f90 | 36 + flang/test/Lower/OpenMP/implicit-dsa.f90 | 23 +- .../Lower/OpenMP/target-parallel-private.f90 | 21 + .../Lower/OpenMP/target-teams-private.f90 | 20 + flang/test/Lower/OpenMP/taskgroup02.f90 | 32 + flang/test/Lower/volatile-openmp.f90 | 8 +- .../OpenMP/declare-target-indirect-tree.f90 | 53 + flang/test/Preprocessing/bug518.F | 2 +- flang/test/Semantics/OpenMP/implicit-dsa.f90 | 6 +- .../Semantics/OpenMP/parallel-sections01.f90 | 2 + flang/test/Semantics/OpenMP/sections-goto.f90 | 11 + flang/test/Semantics/OpenMP/sections02.f90 | 2 + flang/test/Semantics/allocate11.f90 | 1 + flang/test/Semantics/indirect01.f90 | 34 + flang/test/Semantics/indirect02.f90 | 36 + flang/test/Semantics/modfile71.F90 | 7 +- flang/test/Semantics/modfile75.F90 | 3 +- flang/test/Semantics/modfile76.F90 | 15 +- flang/test/Semantics/modfile77.F90 | 3 +- flang/test/Semantics/modfile78.F90 | 3 
+- flang/test/Semantics/modfile79.F90 | 34 + flang/test/Semantics/typeinfo01.f90 | 34 +- flang/test/Semantics/typeinfo03.f90 | 2 +- flang/test/Semantics/typeinfo04.f90 | 8 +- flang/test/Semantics/typeinfo05.f90 | 4 +- flang/test/Semantics/typeinfo06.f90 | 4 +- flang/test/Semantics/typeinfo07.f90 | 8 +- flang/test/Semantics/typeinfo08.f90 | 2 +- flang/test/Semantics/typeinfo11.f90 | 2 +- flang/test/Semantics/typeinfo12.f90 | 67 + flang/test/Semantics/typeinfo13.f90 | 2 +- libc/hdr/types/char8_t.h | 8 - libc/src/__support/FPUtil/dyadic_float.h | 5 +- libc/src/__support/HashTable/CMakeLists.txt | 5 +- libc/src/__support/HashTable/table.h | 15 +- libc/src/__support/wchar/CMakeLists.txt | 9 +- .../__support/wchar/character_converter.cpp | 125 +- .../src/__support/wchar/character_converter.h | 8 +- libc/src/__support/wchar/mbstate.h | 9 + libc/src/__support/wchar/utf_ret.h | 24 - libc/src/stdio/baremetal/printf.cpp | 8 +- libc/src/stdio/baremetal/putchar.cpp | 2 +- libc/src/stdio/baremetal/puts.cpp | 4 +- libc/src/stdio/baremetal/vprintf.cpp | 8 +- libc/test/src/__support/CMakeLists.txt | 5 + .../src/__support/HashTable/table_test.cpp | 4 +- libc/test/src/__support/wchar/CMakeLists.txt | 21 + .../src/__support/wchar/utf32_to_8_test.cpp | 180 + .../src/__support/wchar/utf8_to_32_test.cpp | 196 + libc/test/src/stdio/CMakeLists.txt | 13 + libc/test/src/stdio/fdopen_test.cpp | 14 +- libc/test/src/stdio/fgetc_test.cpp | 22 +- libc/test/src/stdio/fgetc_unlocked_test.cpp | 22 +- libc/test/src/stdio/fgets_test.cpp | 19 +- libc/test/src/stdio/fileop_test.cpp | 20 +- libc/test/src/stdio/fopencookie_test.cpp | 15 +- libc/test/src/stdio/remove_test.cpp | 10 +- libc/test/src/stdio/rename_test.cpp | 9 +- libc/test/src/stdio/setvbuf_test.cpp | 9 +- libc/test/src/stdio/unlocked_fileop_test.cpp | 7 +- libc/test/src/stdlib/StrtolTest.h | 1 - libc/test/src/stdlib/strtold_test.cpp | 1 - libcxx/include/__config | 9 +- libcxx/include/__utility/pair.h | 15 +- libcxx/include/fstream | 12 +- 
libcxx/include/streambuf | 53 +- .../streambuf.get.area/setg.assert.pass.cpp | 2 +- .../streambuf.put.area/setp.assert.pass.cpp | 2 +- lld/MachO/DriverUtils.cpp | 40 +- lld/test/ELF/lto/aarch64-pac-got-func.ll | 50 +- lld/test/MachO/reexport-with-symlink.s | 74 + lldb/cmake/modules/FindPythonAndSwig.cmake | 4 +- lldb/include/lldb/Core/PluginManager.h | 108 +- .../test/tools/lldb-dap/dap_server.py | 8 +- lldb/source/Core/DynamicLoader.cpp | 2 + lldb/source/Core/PluginManager.cpp | 441 +- .../Clang/IRDynamicChecks.cpp | 116 +- .../ExpressionParser/Clang/IRDynamicChecks.h | 1 - .../Language/CPlusPlus/CxxStringTypes.cpp | 102 +- .../Language/CPlusPlus/CxxStringTypes.h | 29 + .../Plugins/Language/CPlusPlus/LibCxx.cpp | 147 +- .../Language/CPlusPlus/LibCxxUnorderedMap.cpp | 18 +- .../ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 82 +- .../Process/minidump/MinidumpParser.cpp | 72 +- .../Plugins/Process/minidump/MinidumpParser.h | 21 +- lldb/test/API/commands/plugin/TestPlugin.py | 62 + .../string/TestDataFormatterLibcxxString.py | 8 +- .../TestDataFormatterLibcxxStringView.py | 8 +- lldb/test/API/lit.cfg.py | 7 + lldb/test/API/lit.site.cfg.py.in | 1 + .../TestDAP_setExceptionBreakpoints.py | 11 +- .../tools/lldb-dap/exception/objc/Makefile | 2 +- .../exception/objc/TestDAP_exception_objc.py | 39 +- .../API/tools/lldb-dap/exception/objc/main.m | 12 +- .../lldb-dap/save-core/TestDAP_save_core.py | 6 - .../stepInTargets/TestDAP_stepInTargets.py | 50 + .../Shell/Commands/command-plugin-list.test | 8 +- .../Shell/ObjectFile/XCOFF/symbol-info.yaml | 121 + .../Shell/ObjectFile/XCOFF/symbol-info32.yaml | 124 + lldb/test/Shell/lit.cfg.py | 3 + lldb/tools/debugserver/source/RNBRemote.cpp | 1 - lldb/tools/lldb-dap/DAP.cpp | 158 +- lldb/tools/lldb-dap/DAP.h | 4 +- lldb/tools/lldb-dap/EventHelper.cpp | 5 + lldb/tools/lldb-dap/ExceptionBreakpoint.cpp | 26 +- lldb/tools/lldb-dap/ExceptionBreakpoint.h | 14 +- .../Handler/DisassembleRequestHandler.cpp | 34 +- 
.../Handler/InitializeRequestHandler.cpp | 1 - lldb/tools/lldb-dap/Handler/RequestHandler.h | 28 +- .../SetExceptionBreakpointsRequestHandler.cpp | 107 +- .../Handler/StepInTargetsRequestHandler.cpp | 198 +- lldb/tools/lldb-dap/JSONUtils.cpp | 9 - lldb/tools/lldb-dap/JSONUtils.h | 12 - .../lldb-dap/Protocol/ProtocolRequests.cpp | 24 + .../lldb-dap/Protocol/ProtocolRequests.h | 65 + .../tools/lldb-dap/Protocol/ProtocolTypes.cpp | 80 +- lldb/tools/lldb-dap/Protocol/ProtocolTypes.h | 69 +- lldb/tools/lldb-dap/ProtocolUtils.cpp | 11 + lldb/tools/lldb-dap/ProtocolUtils.h | 13 + lldb/tools/lldb-dap/package.json | 18 +- lldb/unittests/DAP/ProtocolTypesTest.cpp | 129 +- llvm/cmake/config-ix.cmake | 5 + llvm/cmake/modules/AddLLVM.cmake | 13 +- llvm/cmake/modules/HandleLLVMOptions.cmake | 10 +- llvm/docs/AMDGPUUsage.rst | 3 +- llvm/docs/InstCombineContributorGuide.md | 39 + llvm/docs/RISCVUsage.rst | 39 +- llvm/docs/ReleaseNotes.md | 1 + llvm/include/llvm-c/ExecutionEngine.h | 8 + llvm/include/llvm/ADT/GenericUniformityImpl.h | 2 +- .../llvm/Analysis/IRSimilarityIdentifier.h | 4 - llvm/include/llvm/Analysis/PtrUseVisitor.h | 1 - llvm/include/llvm/Analysis/VecFuncs.def | 260 + llvm/include/llvm/BinaryFormat/DXContainer.h | 6 +- .../BinaryFormat/DXContainerConstants.def | 41 +- llvm/include/llvm/BinaryFormat/Dwarf.h | 12 +- .../ELFRelocs/RISCV_nonstandard.def | 2 +- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 4 +- llvm/include/llvm/CodeGen/SDPatternMatch.h | 4 + llvm/include/llvm/CodeGen/TargetLowering.h | 14 +- llvm/include/llvm/Config/config.h.cmake | 3 + llvm/include/llvm/Config/llvm-config.h.cmake | 4 +- .../llvm/DebugInfo/DWARF/DWARFCFIPrinter.h | 28 + .../llvm/DebugInfo/DWARF/DWARFCFIProgram.h | 62 +- .../llvm/Frontend/HLSL/HLSLRootSignature.h | 1 + .../Frontend/HLSL/HLSLRootSignatureUtils.h | 68 + llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 2 +- llvm/include/llvm/Frontend/OpenMP/OMP.td | 5 +- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 2 +- 
llvm/include/llvm/IR/DebugLoc.h | 14 +- llvm/include/llvm/IR/InstVisitor.h | 10 - llvm/include/llvm/IR/Module.h | 4 + llvm/include/llvm/IR/RuntimeLibcalls.h | 40 +- llvm/include/llvm/InitializePasses.h | 589 +- llvm/include/llvm/MC/MCAsmInfo.h | 5 +- llvm/include/llvm/MC/MCExpr.h | 7 - llvm/include/llvm/Object/ELF.h | 4 +- .../ObjectYAML/CodeViewYAMLDebugSections.h | 12 +- .../llvm/ObjectYAML/CodeViewYAMLSymbols.h | 6 +- .../llvm/ObjectYAML/CodeViewYAMLTypes.h | 14 +- llvm/include/llvm/ObjectYAML/DWARFEmitter.h | 39 +- llvm/include/llvm/ObjectYAML/DWARFYAML.h | 64 +- .../include/llvm/ObjectYAML/DXContainerYAML.h | 81 +- llvm/include/llvm/ObjectYAML/YAML.h | 9 +- llvm/include/llvm/ObjectYAML/yaml2obj.h | 42 +- llvm/include/llvm/Option/OptSpecifier.h | 2 + llvm/include/llvm/Pass.h | 13 +- llvm/include/llvm/PassAnalysisSupport.h | 16 +- llvm/include/llvm/PassRegistry.h | 17 +- llvm/include/llvm/PassSupport.h | 3 +- llvm/include/llvm/Passes/OptimizationLevel.h | 13 +- llvm/include/llvm/Passes/PassBuilder.h | 191 +- llvm/include/llvm/Passes/PassPlugin.h | 2 +- .../llvm/Passes/StandardInstrumentations.h | 68 +- llvm/include/llvm/Support/CodeGen.h | 9 + .../include/llvm/Target/CGPassBuilderOption.h | 3 +- .../llvm/Target/TargetLoweringObjectFile.h | 3 +- llvm/include/llvm/Target/TargetMachine.h | 5 +- llvm/include/llvm/Target/TargetOptions.h | 11 +- .../llvm/TargetParser/SubtargetFeature.h | 2 +- llvm/include/llvm/Transforms/IPO.h | 2 + llvm/include/llvm/Transforms/IPO/Attributor.h | 41 + llvm/include/llvm/Transforms/Scalar/GVN.h | 21 +- llvm/include/llvm/Transforms/Utils/Local.h | 7 +- .../llvm/Transforms/Utils/PromoteMemToReg.h | 2 + llvm/include/llvm/XRay/BlockIndexer.h | 3 +- llvm/include/llvm/XRay/BlockPrinter.h | 3 +- llvm/include/llvm/XRay/BlockVerifier.h | 3 +- llvm/include/llvm/XRay/FDRRecordConsumer.h | 5 +- llvm/include/llvm/XRay/FDRRecordProducer.h | 3 +- llvm/include/llvm/XRay/FDRRecords.h | 29 +- llvm/include/llvm/XRay/FDRTraceWriter.h | 5 +- 
llvm/include/llvm/XRay/FileHeaderReader.h | 5 +- llvm/include/llvm/XRay/InstrumentationMap.h | 11 +- llvm/include/llvm/XRay/Profile.h | 19 +- llvm/include/llvm/XRay/RecordPrinter.h | 3 +- llvm/include/llvm/XRay/Trace.h | 8 +- llvm/lib/Analysis/AliasSetTracker.cpp | 3 - llvm/lib/Analysis/CallGraph.cpp | 5 +- llvm/lib/Analysis/ConstantFolding.cpp | 4 + llvm/lib/Analysis/DemandedBits.cpp | 3 +- llvm/lib/Analysis/IR2Vec.cpp | 13 +- llvm/lib/Analysis/InstructionSimplify.cpp | 17 + llvm/lib/Analysis/Loads.cpp | 2 +- llvm/lib/Analysis/MemoryBuiltins.cpp | 3 + .../lib/Analysis/MemoryDependenceAnalysis.cpp | 8 - llvm/lib/Analysis/MemoryLocation.cpp | 4 +- llvm/lib/Analysis/ScalarEvolution.cpp | 11 +- llvm/lib/Analysis/TargetLibraryInfo.cpp | 12 + llvm/lib/Analysis/ValueTracking.cpp | 6 - llvm/lib/BinaryFormat/Dwarf.cpp | 12 + llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 28 +- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h | 6 +- llvm/lib/CodeGen/CodeGenPrepare.cpp | 14 +- llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp | 1 - llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp | 1 - llvm/lib/CodeGen/MachineBasicBlock.cpp | 1 - llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 88 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 7 + .../SelectionDAG/SelectionDAGBuilder.cpp | 27 +- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 1 - .../CodeGen/SelectionDAG/TargetLowering.cpp | 5 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 +- llvm/lib/DebugInfo/DWARF/CMakeLists.txt | 1 + llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp | 121 + llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp | 94 - llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp | 6 +- .../Frontend/HLSL/HLSLRootSignatureUtils.cpp | 445 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 346 +- llvm/lib/IR/BasicBlock.cpp | 1 - llvm/lib/IR/DebugInfo.cpp | 5 - llvm/lib/IR/DebugLoc.cpp | 4 +- llvm/lib/IR/DiagnosticInfo.cpp | 4 + llvm/lib/IR/IRBuilder.cpp | 1 - llvm/lib/IR/Module.cpp | 7 + llvm/lib/IR/RuntimeLibcalls.cpp | 220 +- llvm/lib/MC/MCAsmInfo.cpp | 18 
+- llvm/lib/MC/MCExpr.cpp | 9 - llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp | 8 - llvm/lib/ObjCopy/MachO/MachOObject.cpp | 4 - llvm/lib/ObjCopy/MachO/MachOObject.h | 3 - llvm/lib/ObjCopy/MachO/MachOReader.cpp | 4 - llvm/lib/Object/RelocationResolver.cpp | 6 +- llvm/lib/Support/Unix/Process.inc | 24 +- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 4 +- .../Target/AArch64/AArch64FrameLowering.cpp | 4 +- .../Target/AArch64/AArch64ISelLowering.cpp | 120 +- .../Target/AArch64/AArch64StackTagging.cpp | 3 +- .../Target/AArch64/AArch64TargetMachine.cpp | 4 +- .../AArch64/AArch64TargetTransformInfo.cpp | 25 +- .../AArch64/AsmParser/AArch64AsmParser.cpp | 3 +- .../Disassembler/AArch64Disassembler.cpp | 3 +- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 3 +- .../MCTargetDesc/AArch64MCTargetDesc.cpp | 4 +- .../AArch64/TargetInfo/AArch64TargetInfo.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 4 +- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 88 +- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 2 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 25 +- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 106 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 73 +- .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 7 + .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 4 +- .../Disassembler/AMDGPUDisassembler.cpp | 4 +- .../AMDGPU/MCA/AMDGPUCustomBehaviour.cpp | 4 +- .../MCTargetDesc/AMDGPUMCTargetDesc.cpp | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 47 +- llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 5 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 49 + .../AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp | 4 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 1 - llvm/lib/Target/AMDGPU/VOP3Instructions.td | 12 +- llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 4 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 182 +- llvm/lib/Target/ARM/ARMTargetMachine.cpp | 3 
+- .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 2 +- .../ARM/Disassembler/ARMDisassembler.cpp | 3 +- .../ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 3 +- .../Target/ARM/TargetInfo/ARMTargetInfo.cpp | 5 +- llvm/lib/Target/AVR/AVRAsmPrinter.cpp | 8 +- llvm/lib/Target/AVR/AVRMCInstLower.cpp | 14 +- llvm/lib/Target/AVR/AVRTargetMachine.cpp | 3 +- .../lib/Target/AVR/AsmParser/AVRAsmParser.cpp | 17 +- .../AVR/Disassembler/AVRDisassembler.cpp | 5 +- .../AVR/MCTargetDesc/AVRELFObjectWriter.cpp | 32 +- .../Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp | 196 + .../Target/AVR/MCTargetDesc/AVRMCAsmInfo.h | 33 + .../AVR/MCTargetDesc/AVRMCCodeEmitter.cpp | 2 +- .../AVR/MCTargetDesc/AVRMCELFStreamer.cpp | 22 +- .../AVR/MCTargetDesc/AVRMCELFStreamer.h | 9 +- .../lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 194 +- llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h | 28 +- .../AVR/MCTargetDesc/AVRMCTargetDesc.cpp | 3 +- .../Target/AVR/TargetInfo/AVRTargetInfo.cpp | 4 +- .../lib/Target/BPF/AsmParser/BPFAsmParser.cpp | 4 +- llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 4 +- llvm/lib/Target/BPF/BPFTargetMachine.cpp | 3 +- .../BPF/Disassembler/BPFDisassembler.cpp | 5 +- .../BPF/MCTargetDesc/BPFMCTargetDesc.cpp | 4 +- .../Target/BPF/TargetInfo/BPFTargetInfo.cpp | 4 +- .../Target/DirectX/DXILFinalizeLinkage.cpp | 4 +- llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 44 +- llvm/lib/Target/DirectX/DXILPrepare.cpp | 44 +- .../Hexagon/AsmParser/HexagonAsmParser.cpp | 4 +- .../Disassembler/HexagonDisassembler.cpp | 4 +- llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp | 4 +- .../Hexagon/HexagonLoopIdiomRecognition.cpp | 2 +- .../Target/Hexagon/HexagonTargetMachine.cpp | 4 +- .../MCTargetDesc/HexagonMCTargetDesc.cpp | 4 +- .../Hexagon/TargetInfo/HexagonTargetInfo.cpp | 4 +- .../Target/Lanai/AsmParser/LanaiAsmParser.cpp | 4 +- .../Lanai/Disassembler/LanaiDisassembler.cpp | 4 +- llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp | 4 +- llvm/lib/Target/Lanai/LanaiTargetMachine.cpp | 3 +- 
.../Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp | 4 +- .../Lanai/TargetInfo/LanaiTargetInfo.cpp | 4 +- .../AsmParser/LoongArchAsmParser.cpp | 4 +- .../Disassembler/LoongArchDisassembler.cpp | 4 +- .../Target/LoongArch/LoongArchAsmPrinter.cpp | 4 +- .../LoongArch/LoongArchTargetMachine.cpp | 4 +- .../MCTargetDesc/LoongArchAsmBackend.cpp | 47 +- .../MCTargetDesc/LoongArchAsmBackend.h | 4 + .../MCTargetDesc/LoongArchMCTargetDesc.cpp | 3 +- .../TargetInfo/LoongArchTargetInfo.cpp | 4 +- .../MSP430/AsmParser/MSP430AsmParser.cpp | 4 +- .../Disassembler/MSP430Disassembler.cpp | 4 +- .../MCTargetDesc/MSP430MCTargetDesc.cpp | 4 +- llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp | 4 +- .../lib/Target/MSP430/MSP430TargetMachine.cpp | 3 +- .../MSP430/TargetInfo/MSP430TargetInfo.cpp | 4 +- .../Target/Mips/AsmParser/MipsAsmParser.cpp | 61 +- .../Mips/Disassembler/MipsDisassembler.cpp | 3 +- .../Target/Mips/MCTargetDesc/CMakeLists.txt | 1 - .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 1 + .../Mips/MCTargetDesc/MipsMCAsmInfo.cpp | 8 + .../Target/Mips/MCTargetDesc/MipsMCAsmInfo.h | 3 +- .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 2 +- .../Target/Mips/MCTargetDesc/MipsMCExpr.cpp | 39 - .../lib/Target/Mips/MCTargetDesc/MipsMCExpr.h | 36 - .../Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 3 +- .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 21 +- llvm/lib/Target/Mips/MipsAsmPrinter.cpp | 6 +- llvm/lib/Target/Mips/MipsMCInstLower.cpp | 10 +- llvm/lib/Target/Mips/MipsTargetMachine.cpp | 3 +- llvm/lib/Target/Mips/MipsTargetObjectFile.cpp | 2 +- .../Target/Mips/TargetInfo/MipsTargetInfo.cpp | 4 +- .../NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp | 4 +- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 4 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 450 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 3 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 4 - llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 94 +- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 3 +- .../NVPTX/TargetInfo/NVPTXTargetInfo.cpp | 4 +- 
.../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 4 +- .../PowerPC/Disassembler/PPCDisassembler.cpp | 4 +- .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 4 +- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 4 +- llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp | 5 +- llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 4 +- .../PowerPC/TargetInfo/PowerPCTargetInfo.cpp | 4 +- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 4 +- .../RISCV/Disassembler/RISCVDisassembler.cpp | 8 +- .../Target/RISCV/MCA/RISCVCustomBehaviour.cpp | 4 +- .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 6 +- .../MCTargetDesc/RISCVELFObjectWriter.cpp | 4 +- .../RISCV/MCTargetDesc/RISCVFixupKinds.h | 2 +- .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 2 +- .../RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp | 4 +- llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 4 +- llvm/lib/Target/RISCV/RISCVFeatures.td | 18 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 77 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 9 + llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 31 + llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 5 + llvm/lib/Target/RISCV/RISCVProcessors.td | 8 + llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 8 +- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 75 +- .../RISCV/TargetInfo/RISCVTargetInfo.cpp | 4 +- .../SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp | 4 +- llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 4 +- llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 2 + llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 3 +- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 3 +- llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 3 +- .../SPIRV/TargetInfo/SPIRVTargetInfo.cpp | 4 +- .../Target/Sparc/AsmParser/SparcAsmParser.cpp | 15 +- .../Sparc/Disassembler/SparcDisassembler.cpp | 5 +- .../MCTargetDesc/SparcELFObjectWriter.cpp | 4 +- .../Sparc/MCTargetDesc/SparcMCAsmInfo.cpp | 1 - .../Sparc/MCTargetDesc/SparcMCAsmInfo.h | 5 + .../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp | 8 +- .../Target/Sparc/MCTargetDesc/SparcMCExpr.cpp | 2 +- 
.../Target/Sparc/MCTargetDesc/SparcMCExpr.h | 32 - .../Sparc/MCTargetDesc/SparcMCTargetDesc.cpp | 4 +- llvm/lib/Target/Sparc/SparcAsmPrinter.cpp | 10 +- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 2 +- llvm/lib/Target/Sparc/SparcTargetMachine.cpp | 3 +- .../Target/Sparc/SparcTargetObjectFile.cpp | 2 +- .../Sparc/TargetInfo/SparcTargetInfo.cpp | 4 +- .../SystemZ/AsmParser/SystemZAsmParser.cpp | 4 +- .../Disassembler/SystemZDisassembler.cpp | 4 +- .../MCTargetDesc/SystemZMCTargetDesc.cpp | 4 +- llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 4 +- .../Target/SystemZ/SystemZTargetMachine.cpp | 4 +- .../SystemZ/TargetInfo/SystemZTargetInfo.cpp | 4 +- llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp | 3 +- .../Target/VE/Disassembler/VEDisassembler.cpp | 4 +- .../Target/VE/MCTargetDesc/VEMCTargetDesc.cpp | 3 +- .../lib/Target/VE/TargetInfo/VETargetInfo.cpp | 3 +- llvm/lib/Target/VE/VEAsmPrinter.cpp | 3 +- llvm/lib/Target/VE/VETargetMachine.cpp | 3 +- .../AsmParser/WebAssemblyAsmParser.cpp | 4 +- .../Disassembler/WebAssemblyDisassembler.cpp | 3 +- .../MCTargetDesc/WebAssemblyMCTargetDesc.cpp | 4 +- .../TargetInfo/WebAssemblyTargetInfo.cpp | 4 +- .../WebAssembly/WebAssemblyAsmPrinter.cpp | 4 +- .../WebAssembly/WebAssemblyTargetMachine.cpp | 4 +- llvm/lib/Target/X86/X86FixupInstTuning.cpp | 26 + llvm/lib/Target/X86/X86ISelLowering.cpp | 264 +- llvm/lib/Target/X86/X86WinEHUnwindV2.cpp | 152 +- .../XCore/Disassembler/XCoreDisassembler.cpp | 4 +- .../XCore/MCTargetDesc/XCoreMCTargetDesc.cpp | 4 +- .../XCore/TargetInfo/XCoreTargetInfo.cpp | 4 +- llvm/lib/Target/XCore/XCoreAsmPrinter.cpp | 4 +- llvm/lib/Target/XCore/XCoreTargetMachine.cpp | 3 +- .../Xtensa/AsmParser/XtensaAsmParser.cpp | 29 +- .../Disassembler/XtensaDisassembler.cpp | 56 +- .../MCTargetDesc/XtensaMCTargetDesc.cpp | 89 +- .../Xtensa/MCTargetDesc/XtensaMCTargetDesc.h | 9 +- llvm/lib/Target/Xtensa/XtensaFeatures.td | 40 + llvm/lib/Target/Xtensa/XtensaInstrInfo.td | 115 + llvm/lib/Target/Xtensa/XtensaRegisterInfo.td | 
111 +- llvm/lib/Target/Xtensa/XtensaSubtarget.h | 9 + llvm/lib/TargetParser/AArch64TargetParser.cpp | 5 +- .../AggressiveInstCombine.cpp | 4 +- llvm/lib/Transforms/IPO/Attributor.cpp | 4 +- .../Transforms/IPO/AttributorAttributes.cpp | 339 + llvm/lib/Transforms/IPO/ElimAvailExtern.cpp | 36 +- .../Transforms/IPO/FunctionSpecialization.cpp | 2 +- llvm/lib/Transforms/IPO/IROutliner.cpp | 14 - llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 44 +- .../lib/Transforms/IPO/SampleProfileProbe.cpp | 3 +- .../InstCombine/InstCombineAddSub.cpp | 28 + .../InstCombine/InstCombineCalls.cpp | 29 +- .../InstCombine/InstCombineCompares.cpp | 24 + .../InstCombineLoadStoreAlloca.cpp | 1 - .../InstCombine/InstructionCombining.cpp | 12 +- .../Instrumentation/GCOVProfiling.cpp | 8 - .../Instrumentation/ThreadSanitizer.cpp | 3 +- llvm/lib/Transforms/Scalar/ADCE.cpp | 15 +- .../Transforms/Scalar/ConstantHoisting.cpp | 1 - llvm/lib/Transforms/Scalar/GVN.cpp | 96 +- llvm/lib/Transforms/Scalar/GVNHoist.cpp | 3 +- llvm/lib/Transforms/Scalar/LICM.cpp | 4 - .../Transforms/Scalar/LoopStrengthReduce.cpp | 11 +- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 2 +- .../Scalar/LowerMatrixIntrinsics.cpp | 84 +- .../lib/Transforms/Scalar/SimplifyCFGPass.cpp | 1 - .../Scalar/SpeculativeExecution.cpp | 8 +- .../Scalar/TailRecursionElimination.cpp | 17 +- llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 3 +- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 51 +- llvm/lib/Transforms/Utils/Debugify.cpp | 18 +- llvm/lib/Transforms/Utils/Evaluator.cpp | 7 - llvm/lib/Transforms/Utils/InlineFunction.cpp | 9 +- llvm/lib/Transforms/Utils/Local.cpp | 12 +- llvm/lib/Transforms/Utils/LoopPeel.cpp | 5 +- .../Transforms/Utils/LoopRotationUtils.cpp | 44 +- .../Utils/ScalarEvolutionExpander.cpp | 15 +- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 159 +- .../Vectorize/LoopVectorizationLegality.cpp | 2 - .../Transforms/Vectorize/LoopVectorize.cpp | 31 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 222 +- 
llvm/lib/Transforms/Vectorize/VPlan.cpp | 20 +- llvm/lib/Transforms/Vectorize/VPlan.h | 22 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 157 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 165 +- .../Transforms/Vectorize/VPlanTransforms.h | 5 - llvm/test/Analysis/CostModel/AArch64/cmp.ll | 21 +- .../Analysis/CostModel/AMDGPU/canonicalize.ll | 24 +- .../Analysis/CostModel/AMDGPU/copysign.ll | 60 +- .../Analysis/CostModel/AMDGPU/maximumnum.ll | 506 ++ .../Analysis/CostModel/AMDGPU/minimumnum.ll | 506 ++ .../AMDGPU/special-argument-intrinsics.ll | 202 + .../Analysis/CostModel/RISCV/cast-half.ll | 16 +- llvm/test/Analysis/CostModel/RISCV/cast.ll | 6262 ++++++----------- llvm/test/Analysis/CostModel/RISCV/cmp.ll | 490 +- .../Analysis/CostModel/RISCV/rvv-shuffle.ll | 68 +- .../CostModel/RISCV/shuffle-reverse.ll | 52 +- .../AArch64/aarch64-bitwisenot-fold.ll | 98 + llvm/test/CodeGen/AArch64/arm64ec-builtins.ll | 33 +- .../neon-partial-reduce-dot-product.ll | 1303 ++-- .../replace-with-veclib-libmvec-scalable.ll | 579 ++ .../AArch64/replace-with-veclib-libmvec.ll | 577 ++ llvm/test/CodeGen/AArch64/sve-expand-div.ll | 1 + .../sve-fixed-length-partial-reduce.ll | 6 +- .../AArch64/sve-fixed-length-sdiv-pow2.ll | 1 + .../AArch64/sve-partial-reduce-dot-product.ll | 1926 ++--- .../AArch64/sve-partial-reduce-wide-add.ll | 290 +- llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll | 1 + llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 42 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 141 +- llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 165 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 141 +- llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 165 +- llvm/test/CodeGen/AMDGPU/maximumnum.ll | 1442 ++-- llvm/test/CodeGen/AMDGPU/minimumnum.ll | 1442 ++-- .../CodeGen/AMDGPU/promote-alloca-structs.ll | 286 - .../finalize-linkage-remove-dead-lib.ll | 77 +- .../DirectX/finalize-linkage-remove-dead.ll | 46 +- llvm/test/CodeGen/DirectX/finalize_linkage.ll | 25 +- 
llvm/test/CodeGen/DirectX/flatten-array.ll | 70 + .../DirectX/llc-vector-load-scalarize.ll | 64 +- .../DirectX/noop_bitcast_global_array_type.ll | 53 + .../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll | 10 +- llvm/test/CodeGen/NVPTX/bug26185-2.ll | 22 +- llvm/test/CodeGen/NVPTX/bug26185.ll | 73 +- llvm/test/CodeGen/NVPTX/i1-ext-load.ll | 4 +- llvm/test/CodeGen/NVPTX/ldu-ldg.ll | 8 +- llvm/test/CodeGen/NVPTX/variadics-backend.ll | 19 +- llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll | 588 ++ llvm/test/CodeGen/PowerPC/aix-cc-byval.ll | 1301 ++-- .../PowerPC/aix-vector-vararg-caller.ll | 227 +- .../PowerPC/aix-vector-vararg-fixed-caller.ll | 137 +- llvm/test/CodeGen/PowerPC/f128-arith.ll | 445 ++ .../RISCV/GlobalISel/irtranslator/fallback.ll | 4 +- llvm/test/CodeGen/RISCV/O0-pipeline.ll | 2 +- llvm/test/CodeGen/RISCV/O3-pipeline.ll | 2 +- llvm/test/CodeGen/RISCV/attributes.ll | 14 +- llvm/test/CodeGen/RISCV/features-info.ll | 1 + .../CodeGen/RISCV/rvv/extract-subvector.ll | 16 +- .../CodeGen/RISCV/rvv/get_vector_length.ll | 24 +- .../CodeGen/RISCV/rvv/insert-subvector.ll | 38 +- .../CodeGen/RISCV/rvv/legalize-load-sdnode.ll | 12 +- .../RISCV/rvv/legalize-store-sdnode.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll | 283 +- llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll | 12 +- llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll | 9 +- llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll | 3 +- llvm/test/CodeGen/RISCV/rvv/stepvector.ll | 20 +- .../RISCV/rvv/undef-earlyclobber-chain.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll | 48 +- .../RISCV/rvv/vector-deinterleave-fixed.ll | 186 +- .../CodeGen/RISCV/rvv/vector-deinterleave.ll | 173 +- .../RISCV/rvv/vector-interleave-store.ll | 9 +- .../CodeGen/RISCV/rvv/vector-interleave.ll | 817 +-- llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll | 12 +- .../RISCV/rvv/vp-vector-interleaved-access.ll | 100 +- .../RISCV/rvv/vreductions-fp-sdnode.ll | 42 +- llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll | 26 +- llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll | 
798 +++ llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll | 958 +++ .../CodeGen/SPIRV/constant/spec-constant.ll | 73 + llvm/test/CodeGen/X86/combine-or-shuffle.ll | 2 +- llvm/test/CodeGen/X86/dpbusd.ll | 12 +- llvm/test/CodeGen/X86/dpbusd_const.ll | 16 +- llvm/test/CodeGen/X86/fixup-blend.ll | 208 +- llvm/test/CodeGen/X86/horizontal-sum.ll | 28 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 6 +- .../CodeGen/X86/vector-reduce-add-mask.ll | 2 +- .../CodeGen/X86/vector-reduce-add-zext.ll | 4 +- llvm/test/CodeGen/X86/vector-reduce-add.ll | 27 +- .../CodeGen/X86/win64-eh-unwindv2-errors.mir | 318 + .../win64-eh-unwindv2-too-many-epilogs.mir | 94 + .../CodeGen/X86/zero_extend_vector_inreg.ll | 42 +- .../zero_extend_vector_inreg_of_broadcast.ll | 6 +- llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml | 86 + ...DW_AT_object_pointer-non-standard-index.ll | 79 + .../DebugInfo/X86/DW_AT_object_pointer.ll | 24 +- .../MC/Disassembler/Xtensa/coprocessor.txt | 10 + llvm/test/MC/Disassembler/Xtensa/debug.txt | 62 + .../test/MC/Disassembler/Xtensa/exception.txt | 42 + .../MC/Disassembler/Xtensa/highinterrupts.txt | 82 + .../test/MC/Disassembler/Xtensa/interrupt.txt | 26 + llvm/test/MC/Disassembler/Xtensa/prid.txt | 10 + llvm/test/MC/Disassembler/Xtensa/timer.txt | 22 + llvm/test/MC/LoongArch/Misc/cfi-advance.s | 12 + .../test/MC/LoongArch/Relocations/fde-reloc.s | 7 +- llvm/test/MC/LoongArch/Relocations/sub-expr.s | 91 +- llvm/test/MC/RISCV/xandesvbfhcvt-valid.s | 27 + llvm/test/MC/Xtensa/Core/processor-control.s | 5 + llvm/test/MC/Xtensa/coprocessor.s | 20 + llvm/test/MC/Xtensa/debug-invalid.s | 9 + llvm/test/MC/Xtensa/debug.s | 190 + llvm/test/MC/Xtensa/exception.s | 100 + llvm/test/MC/Xtensa/highinterrupts.s | 280 + llvm/test/MC/Xtensa/interrupt.s | 60 + llvm/test/MC/Xtensa/prid.s | 20 + llvm/test/MC/Xtensa/timer.s | 65 + .../Attributor/AMDGPU/tag-invariant-loads.ll | 382 + .../Attributor/dereferenceable-1.ll | 1 - .../Attributor/value-simplify-local-remote.ll | 22 +- 
.../DeadStoreElimination/trivial-dse-calls.ll | 32 + .../convert-global-variables-to-local.ll | 21 + llvm/test/Transforms/GVN/opt-remarks.ll | 22 + .../simplify-icmp-operands-order.ll | 190 + .../InstCombine/AMDGPU/image-d16.ll | 118 + llvm/test/Transforms/InstCombine/add.ll | 261 + .../Transforms/InstCombine/icmp-subadd.ll | 111 + .../InstSimplify/ConstProp/loads.ll | 36 + .../Transforms/InstSimplify/vp-reverse.ll | 19 +- llvm/test/Transforms/LICM/funclet.ll | 8 +- .../AArch64/postidx-load.ll | 189 + .../LoopUnroll/peel-last-iteration-minmax.ll | 48 +- .../AArch64/clamped-trip-count.ll | 8 +- .../AArch64/conditional-branches-cost.ll | 4 +- .../AArch64/divs-with-scalable-vfs.ll | 4 +- .../AArch64/epilog-iv-select-cmp.ll | 28 +- .../LoopVectorize/AArch64/optsize_minsize.ll | 8 +- .../AArch64/simple_early_exit.ll | 2 +- ...sve-epilog-vect-no-remaining-iterations.ll | 146 + .../AArch64/sve-inductions-unusual-types.ll | 8 +- .../AArch64/sve-interleaved-accesses.ll | 22 +- .../sve-interleaved-masked-accesses.ll | 80 +- ...eave-to-widen-memory-remove-loop-region.ll | 2 +- .../AArch64/veclib-function-calls.ll | 1035 +++ .../AArch64/veclib-intrinsic-calls.ll | 735 ++ .../LoopVectorize/ARM/optsize_minsize.ll | 2 +- .../first-order-recurrence-scalable-vf1.ll | 57 +- .../RISCV/interleaved-masked-access.ll | 60 +- .../LoopVectorize/RISCV/mask-index-type.ll | 2 +- ...ruction-or-drop-poison-generating-flags.ll | 2 +- .../Transforms/LoopVectorize/RISCV/pr88802.ll | 2 +- .../RISCV/riscv-vector-reverse.ll | 20 +- .../LoopVectorize/RISCV/strided-accesses.ll | 2 +- .../LoopVectorize/RISCV/uniform-load-store.ll | 10 +- ...ectorize-force-tail-with-evl-interleave.ll | 6 +- .../LoopVectorize/X86/constant-fold.ll | 4 +- .../X86/drop-poison-generating-flags.ll | 24 +- .../LoopVectorize/X86/induction-costs.ll | 2 +- .../LoopVectorize/X86/interleave-cost.ll | 4 +- ...outer_loop_test1_no_explicit_vect_width.ll | 2 +- .../LoopVectorize/X86/scatter_crash.ll | 16 +- 
.../epilog-vectorization-any-of-reductions.ll | 4 +- ...irst-order-recurrence-dead-instructions.ll | 270 + .../first-order-recurrence-scalable-vf1.ll | 90 +- .../LoopVectorize/first-order-recurrence.ll | 277 +- ...eref-pred-poison-ub-ops-feeding-pointer.ll | 10 +- .../LoopVectorize/pointer-induction.ll | 138 + .../LoopVectorize/reduction-inloop-pred.ll | 36 +- .../scalable-first-order-recurrence.ll | 8 +- .../LoopVectorize/scalable-inductions.ll | 8 +- .../LoopVectorize/scalable-iv-outside-user.ll | 6 +- .../single_early_exit_live_outs.ll | 2 +- .../Transforms/LoopVectorize/uniform-blend.ll | 2 +- .../LoopVectorize/vplan-iv-transforms.ll | 65 + .../Inputs/import-thinlto-funcs.yaml | 5 + .../LowerTypeTests/blockaddress-2.ll | 2 +- .../LowerTypeTests/cfi-coff-comdat-rename.ll | 2 + .../LowerTypeTests/cfi-icall-alias.ll | 2 +- .../Transforms/LowerTypeTests/export-alias.ll | 4 +- .../Transforms/LowerTypeTests/export-icall.ll | 12 +- .../LowerTypeTests/function-disjoint.ll | 4 +- .../Transforms/LowerTypeTests/function.ll | 15 +- .../LowerTypeTests/icall-branch-funnel.ll | 4 +- .../test/Transforms/LowerTypeTests/pr37625.ll | 2 +- .../test/Transforms/LowerTypeTests/section.ll | 2 +- .../Transforms/PhaseOrdering/X86/fmaddsub.ll | 720 ++ .../test/Transforms/PhaseOrdering/X86/hadd.ll | 25 +- .../test/Transforms/PhaseOrdering/X86/hsub.ll | 25 +- .../SLPVectorizer/AMDGPU/external-shuffle.ll | 128 +- .../SLPVectorizer/AMDGPU/slp-v2f16.ll | 52 +- .../SystemZ/non-power-2-subvector-extract.ll | 87 + .../extractelement-single-use-many-nodes.ll | 11 +- .../X86/vec_list_bias-inseltpoison.ll | 25 +- .../Transforms/SLPVectorizer/isCommutative.ll | 34 + llvm/test/Transforms/Util/add-TLI-mappings.ll | 28 +- .../tools/llvm-dwarfdump/X86/statistics.ll | 4 +- .../RISCV/unsupported-opcode.test | 3 + llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s | 2 +- .../MachO/strip-with-encryption-info.test | 156 +- llvm/tools/llvm-exegesis/lib/Target.cpp | 2 + llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h | 
5 +- .../CodeGen/SelectionDAGPatternMatchTest.cpp | 4 + llvm/unittests/Frontend/CMakeLists.txt | 1 + .../Frontend/HLSLRootSignatureDumpTest.cpp | 189 + .../Frontend/HLSLRootSignatureRangesTest.cpp | 177 + llvm/unittests/Object/ELFObjectFileTest.cpp | 11 - llvm/unittests/Target/AArch64/CMakeLists.txt | 1 + .../unittests/Target/LoongArch/CMakeLists.txt | 1 + llvm/unittests/Target/RISCV/CMakeLists.txt | 1 + llvm/unittests/Target/SPIRV/CMakeLists.txt | 1 + llvm/unittests/Target/VE/CMakeLists.txt | 1 + .../Target/WebAssembly/CMakeLists.txt | 1 + .../TargetParser/RISCVISAInfoTest.cpp | 17 +- .../TargetParser/TargetParserTest.cpp | 16 + llvm/utils/TableGen/AsmMatcherEmitter.cpp | 9 +- .../TableGen/Common/CodeGenDAGPatterns.cpp | 18 +- .../TableGen/Common/CodeGenInstruction.cpp | 2 +- .../TableGen/Common/CodeGenInstruction.h | 2 +- .../TableGen/Common/CodeGenRegisters.cpp | 4 +- llvm/utils/TableGen/Common/CodeGenRegisters.h | 10 +- llvm/utils/TableGen/Common/CodeGenSchedule.h | 14 +- .../Common/GlobalISel/GlobalISelMatchTable.h | 18 +- .../utils/TableGen/Common/PredicateExpander.h | 9 +- llvm/utils/TableGen/DAGISelMatcherGen.cpp | 9 +- llvm/utils/TableGen/DecoderEmitter.cpp | 119 +- llvm/utils/TableGen/FastISelEmitter.cpp | 4 +- llvm/utils/gn/secondary/lldb/test/BUILD.gn | 1 + .../llvm/include/llvm/Config/BUILD.gn | 6 +- .../llvm/lib/DebugInfo/DWARF/BUILD.gn | 1 + .../lib/Target/Mips/MCTargetDesc/BUILD.gn | 1 - .../llvm/unittests/Frontend/BUILD.gn | 1 + mlir/docs/Dialects/Vector.md | 3 + .../Dialect/Affine/Transforms/Transforms.h | 14 + .../Dialect/ControlFlow/IR/ControlFlowOps.td | 34 +- .../mlir/Dialect/EmitC/IR/CMakeLists.txt | 6 + mlir/include/mlir/Dialect/EmitC/IR/EmitC.h | 2 +- mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 109 +- .../mlir/Dialect/EmitC/IR/EmitCInterfaces.h | 31 + .../mlir/Dialect/EmitC/IR/EmitCInterfaces.td | 48 + .../mlir/Dialect/EmitC/IR/EmitCTraits.h | 30 - .../mlir/Dialect/LLVMIR/LLVMInterfaces.td | 36 - 
mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 59 +- .../mlir/Dialect/SPIRV/IR/SPIRVGLOps.td | 40 + .../include/mlir/Dialect/Tensor/Utils/Utils.h | 2 +- .../mlir/Dialect/Transform/IR/TransformOps.td | 5 +- .../mlir/Interfaces/ControlFlowInterfaces.h | 20 + .../mlir/Interfaces/ControlFlowInterfaces.td | 112 + mlir/include/mlir/Query/Matcher/Marshallers.h | 61 + mlir/include/mlir/Query/Matcher/MatchFinder.h | 4 +- .../mlir/Query/Matcher/MatchersInternal.h | 116 +- .../mlir/Query/Matcher/SliceMatchers.h | 104 +- .../include/mlir/Query/Matcher/VariantValue.h | 11 +- .../mlir/Target/LLVMIR/ModuleTranslation.h | 2 +- .../ControlFlowToLLVM/ControlFlowToLLVM.cpp | 7 +- .../Conversion/MemRefToLLVM/MemRefToLLVM.cpp | 13 + .../Transforms/AffineExpandIndexOps.cpp | 218 +- mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 18 +- .../EmitC/Transforms/FormExpressions.cpp | 2 +- .../Dialect/EmitC/Transforms/Transforms.cpp | 3 +- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 11 +- .../Dialect/SPIRV/IR/SPIRVCanonicalization.td | 8 + .../SPIRV/IR/SPIRVGLCanonicalization.cpp | 4 +- mlir/lib/Dialect/Tensor/Utils/Utils.cpp | 3 +- .../lib/Dialect/Transform/IR/TransformOps.cpp | 166 +- .../Transforms/VectorTransferOpTransforms.cpp | 28 +- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 297 +- .../Transforms/XeGPUWgToSgDistribute.cpp | 89 +- mlir/lib/Interfaces/ControlFlowInterfaces.cpp | 46 + mlir/lib/Query/Matcher/CMakeLists.txt | 1 + mlir/lib/Query/Matcher/MatchersInternal.cpp | 33 + mlir/lib/Query/Matcher/RegistryManager.cpp | 7 +- mlir/lib/Query/Matcher/VariantValue.cpp | 52 + mlir/lib/Query/Query.cpp | 5 + mlir/lib/Target/Cpp/TranslateToCpp.cpp | 6 +- .../LLVMIR/LLVMIRToLLVMTranslation.cpp | 11 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 3 +- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 11 +- .../Target/SPIRV/Serialization/Serializer.cpp | 13 + .../mlir/dialects/transform/__init__.py | 41 +- mlir/test/CMakeLists.txt | 6 + .../Conversion/ControlFlowToLLVM/branch.mlir | 14 + 
.../MemRefToLLVM/memref-to-llvm.mlir | 10 +- mlir/test/Dialect/ControlFlow/invalid.mlir | 36 + mlir/test/Dialect/ControlFlow/ops.mlir | 10 + mlir/test/Dialect/EmitC/invalid_ops.mlir | 22 + mlir/test/Dialect/EmitC/ops.mlir | 16 +- mlir/test/Dialect/SPIRV/IR/gl-ops.mlir | 66 + .../SPIRV/Transforms/gl-canonicalize.mlir | 22 + .../Transform/test-pass-application.mlir | 145 +- .../scalar-vector-transfer-to-memref.mlir | 30 + mlir/test/Dialect/XeGPU/invalid.mlir | 96 +- mlir/test/Dialect/XeGPU/layout.mlir | 49 + mlir/test/Dialect/XeGPU/ops.mlir | 355 +- .../XeGPU/subgroup-map-propagation.mlir | 72 +- mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 12 +- .../Dialect/XeGPU/xegpu-unroll-patterns.mlir | 271 +- .../XeGPU/xegpu-wg-to-sg-elemwise.mlir | 164 + .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 32 +- mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 40 +- .../CPU/ArmSVE/vector-contract-i8mm.mlir | 463 ++ .../LLVMIR/Import/metadata-profiling.ll | 13 +- mlir/test/Target/LLVMIR/llvmir-invalid.mlir | 16 + mlir/test/Target/LLVMIR/llvmir.mlir | 26 - .../LLVMIR/omptarget-debug-loop-loc.mlir | 66 + mlir/test/Target/LLVMIR/omptarget-depend.mlir | 3 +- .../Target/LLVMIR/omptarget-nowait-llvm.mlir | 45 +- mlir/test/Target/LLVMIR/omptarget-nowait.mlir | 70 + .../LLVMIR/omptargetdata-nowait-llvm.mlir | 45 +- .../Target/LLVMIR/openmp-task-charbox.mlir | 87 + mlir/test/Target/SPIRV/constant.mlir | 5 +- mlir/test/Target/SPIRV/gl-ops.mlir | 4 + .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 44 +- mlir/test/lit.cfg.py | 1 + mlir/test/lit.local.cfg | 7 + mlir/test/lit.site.cfg.py.in | 4 +- ...ex-test.mlir => backward-slice-union.mlir} | 13 +- .../forward-slice-by-predicate.mlir | 27 + .../mlir-query/logical-operator-test.mlir | 11 + .../mlir-query/slice-function-extraction.mlir | 29 + mlir/test/python/dialects/transform.py | 58 +- mlir/tools/mlir-query/mlir-query.cpp | 14 +- mlir/utils/vscode/package-lock.json | 127 +- mlir/utils/vscode/package.json | 2 + 
offload/cmake/caches/AMDGPULibcBot.cmake | 20 + .../llvm-project-overlay/lld/BUILD.bazel | 6 - .../llvm-project-overlay/llvm/config.bzl | 1 + .../llvm/include/llvm/Config/config.h | 3 + .../llvm/include/llvm/Config/llvm-config.h | 4 +- .../llvm-project-overlay/mlir/BUILD.bazel | 2 + .../mlir/test/BUILD.bazel | 1 + utils/bazel/llvm_configs/config.h.cmake | 3 + utils/bazel/llvm_configs/llvm-config.h.cmake | 4 +- 1101 files changed, 39728 insertions(+), 20330 deletions(-) create mode 100644 clang-tools-extra/clangd/test/positionencoding.test create mode 100644 clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp create mode 100644 clang/lib/Headers/cuda_wrappers/bits/c++config.h create mode 100644 clang/test/CIR/CodeGen/builtin_printf.cpp create mode 100644 clang/test/CIR/CodeGen/string-literals.cpp create mode 100644 clang/test/CIR/Transforms/vector-cmp-fold.cir create mode 100644 clang/test/CodeGen/aarch64-always-inline-feature-bug.c create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o create mode 100755 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o create mode 100755 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld create mode 100644 
clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o create mode 100755 clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o create mode 100755 clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld create mode 100644 clang/test/Driver/aarch64-gnutools.c create mode 100644 clang/test/Driver/aarch64-toolchain-extra.c create mode 100644 clang/test/Driver/aarch64-toolchain.c create mode 100644 clang/test/Driver/arm-gnutools.c create mode 100644 clang/test/Driver/arm-toolchain-extra.c create mode 100644 clang/test/Driver/arm-toolchain.c create mode 100644 clang/test/Driver/check-no-multlib-warning.c create mode 100644 clang/test/Driver/ignored-pch.cpp create mode 100644 clang/test/Modules/preferred_name_header_unit.cpp create mode 100644 clang/test/PCH/Inputs/ignored-pch.h create mode 100644 clang/test/PCH/ignored-pch.c create mode 100644 clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp create mode 100644 clang/test/Sema/gh87867.c create mode 100644 clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp create mode 100644 flang-rt/include/flang-rt/runtime/work-queue.h create mode 100644 flang-rt/lib/runtime/work-queue.cpp create mode 100644 flang/test/Fir/cfg-conversion-if.fir create mode 100644 
flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90 delete mode 100644 flang/test/Lower/OpenMP/Todo/target-parallel-private.f90 delete mode 100644 flang/test/Lower/OpenMP/Todo/target-teams-private.f90 create mode 100644 flang/test/Lower/OpenMP/copyprivate5.f90 create mode 100644 flang/test/Lower/OpenMP/target-parallel-private.f90 create mode 100644 flang/test/Lower/OpenMP/target-teams-private.f90 create mode 100644 flang/test/Lower/OpenMP/taskgroup02.f90 create mode 100644 flang/test/Parser/OpenMP/declare-target-indirect-tree.f90 create mode 100644 flang/test/Semantics/OpenMP/sections-goto.f90 create mode 100644 flang/test/Semantics/indirect01.f90 create mode 100644 flang/test/Semantics/indirect02.f90 create mode 100644 flang/test/Semantics/modfile79.F90 create mode 100644 flang/test/Semantics/typeinfo12.f90 delete mode 100644 libc/src/__support/wchar/utf_ret.h create mode 100644 libc/test/src/__support/wchar/CMakeLists.txt create mode 100644 libc/test/src/__support/wchar/utf32_to_8_test.cpp create mode 100644 libc/test/src/__support/wchar/utf8_to_32_test.cpp create mode 100644 lld/test/MachO/reexport-with-symlink.s create mode 100644 lldb/test/API/commands/plugin/TestPlugin.py create mode 100644 lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml create mode 100644 lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml create mode 100644 llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h create mode 100644 llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp delete mode 100644 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp delete mode 100644 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h delete mode 100644 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h create mode 100644 llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll create mode 100644 llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll create mode 100644 llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll create mode 100644 llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll create mode 100644 
llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll create mode 100644 llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll create mode 100644 llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll create mode 100644 llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll create mode 100644 llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll create mode 100644 llvm/test/CodeGen/SPIRV/constant/spec-constant.ll create mode 100644 llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir create mode 100644 llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir create mode 100644 llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml create mode 100644 llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll create mode 100644 llvm/test/MC/Disassembler/Xtensa/coprocessor.txt create mode 100644 llvm/test/MC/Disassembler/Xtensa/debug.txt create mode 100644 llvm/test/MC/Disassembler/Xtensa/exception.txt create mode 100644 llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt create mode 100644 llvm/test/MC/Disassembler/Xtensa/interrupt.txt create mode 100644 llvm/test/MC/Disassembler/Xtensa/prid.txt create mode 100644 llvm/test/MC/Disassembler/Xtensa/timer.txt create mode 100644 llvm/test/MC/RISCV/xandesvbfhcvt-valid.s create mode 100644 llvm/test/MC/Xtensa/coprocessor.s create mode 100644 llvm/test/MC/Xtensa/debug-invalid.s create mode 100644 llvm/test/MC/Xtensa/debug.s create mode 100644 llvm/test/MC/Xtensa/exception.s create mode 100644 llvm/test/MC/Xtensa/highinterrupts.s create mode 100644 llvm/test/MC/Xtensa/interrupt.s create mode 100644 llvm/test/MC/Xtensa/prid.s create mode 100644 llvm/test/MC/Xtensa/timer.s create mode 100644 llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll create mode 100644 llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll create mode 100644 llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll create mode 
100644 llvm/test/Transforms/InstCombine/icmp-subadd.ll create mode 100644 llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll create mode 100644 llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll create mode 100644 llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll create mode 100644 llvm/test/Transforms/SLPVectorizer/isCommutative.ll create mode 100644 llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test create mode 100644 llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp create mode 100644 mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.h create mode 100644 mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td delete mode 100644 mlir/include/mlir/Dialect/EmitC/IR/EmitCTraits.h create mode 100644 mlir/lib/Query/Matcher/MatchersInternal.cpp create mode 100644 mlir/test/Dialect/XeGPU/layout.mlir create mode 100644 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir create mode 100644 mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir create mode 100644 mlir/test/Target/LLVMIR/omptarget-nowait.mlir create mode 100644 mlir/test/Target/LLVMIR/openmp-task-charbox.mlir create mode 100644 mlir/test/lit.local.cfg rename mlir/test/mlir-query/{complex-test.mlir => backward-slice-union.mlir} (71%) create mode 100644 mlir/test/mlir-query/forward-slice-by-predicate.mlir create mode 100644 mlir/test/mlir-query/logical-operator-test.mlir create mode 100644 mlir/test/mlir-query/slice-function-extraction.mlir create mode 100644 offload/cmake/caches/AMDGPULibcBot.cmake diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index 
162161ff13fb0..2f8d5745668d9 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -777,6 +777,10 @@ backend:NVPTX: - 'llvm/**/*nvptx*/**' - 'llvm/**/*NVPTX*/**' +backend:MIPS: + - '**/*mips*' + - '**/*Mips*' + backend:RISC-V: - clang/**/*riscv* - clang/**/*RISCV* diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index 3f07a6dc03a4f..96969cf53baca 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -99,24 +99,28 @@ class DataAggregator : public DataReader { uint64_t Addr; }; + /// Container for the unit of branch data. + /// Backwards compatible with legacy use for branches and fall-throughs: + /// - if \p Branch is FT_ONLY or FT_EXTERNAL_ORIGIN, the trace only + /// contains fall-through data, + /// - if \p To is BR_ONLY, the trace only contains branch data. struct Trace { + static constexpr const uint64_t EXTERNAL = 0ULL; + static constexpr const uint64_t BR_ONLY = -1ULL; + static constexpr const uint64_t FT_ONLY = -1ULL; + static constexpr const uint64_t FT_EXTERNAL_ORIGIN = -2ULL; + + uint64_t Branch; uint64_t From; uint64_t To; - Trace(uint64_t From, uint64_t To) : From(From), To(To) {} - bool operator==(const Trace &Other) const { - return From == Other.From && To == Other.To; - } + auto tie() const { return std::tie(Branch, From, To); } + bool operator==(const Trace &Other) const { return tie() == Other.tie(); } + bool operator<(const Trace &Other) const { return tie() < Other.tie(); } }; + friend raw_ostream &operator<<(raw_ostream &OS, const Trace &); struct TraceHash { - size_t operator()(const Trace &L) const { - return std::hash()(L.From << 32 | L.To); - } - }; - - struct FTInfo { - uint64_t InternCount{0}; - uint64_t ExternCount{0}; + size_t operator()(const Trace &L) const { return hash_combine(L.tie()); } }; struct TakenBranchInfo { @@ -126,8 +130,11 @@ class DataAggregator : public DataReader { /// Intermediate storage 
for profile data. We save the results of parsing /// and use them later for processing and assigning profile. - std::unordered_map BranchLBRs; - std::unordered_map FallthroughLBRs; + std::unordered_map TraceMap; + std::vector> Traces; + /// Pre-populated addresses of returns, coming from pre-aggregated data or + /// disassembly. Used to disambiguate call-continuation fall-throughs. + std::unordered_set Returns; std::unordered_map BasicSamples; std::vector MemSamples; @@ -200,8 +207,8 @@ class DataAggregator : public DataReader { /// Return a vector of offsets corresponding to a trace in a function /// if the trace is valid, std::nullopt otherwise. std::optional, 16>> - getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First, - const LBREntry &Second, uint64_t Count = 1) const; + getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, uint64_t Count, + bool IsReturn) const; /// Record external entry into the function \p BF. /// @@ -261,12 +268,14 @@ class DataAggregator : public DataReader { uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds); + /// Checks if \p Addr corresponds to a return instruction. + bool checkReturn(uint64_t Addr); + /// Register a \p Branch. bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds); /// Register a trace between two LBR entries supplied in execution order. - bool doTrace(const LBREntry &First, const LBREntry &Second, - uint64_t Count = 1); + bool doTrace(const Trace &Trace, uint64_t Count, bool IsReturn); /// Parser helpers /// Return false if we exhausted our parser buffer and finished parsing @@ -516,6 +525,21 @@ inline raw_ostream &operator<<(raw_ostream &OS, OS << formatv("{0:x} -> {1:x}/{2}", L.From, L.To, L.Mispred ? 
'M' : 'P'); return OS; } + +inline raw_ostream &operator<<(raw_ostream &OS, + const DataAggregator::Trace &T) { + switch (T.Branch) { + case DataAggregator::Trace::FT_ONLY: + case DataAggregator::Trace::FT_EXTERNAL_ORIGIN: + break; + default: + OS << Twine::utohexstr(T.Branch) << " -> "; + } + OS << Twine::utohexstr(T.From); + if (T.To != DataAggregator::Trace::BR_ONLY) + OS << " ... " << Twine::utohexstr(T.To); + return OS; +} } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index ade8478f556e9..178c9d3a63730 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -523,6 +523,10 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) { deleteTempFiles(); heatmap: + // Sort parsed traces for faster processing. + if (!opts::BasicAggregation) + llvm::sort(Traces, llvm::less_first()); + if (!opts::HeatmapMode) return Error::success(); @@ -598,8 +602,7 @@ void DataAggregator::processProfile(BinaryContext &BC) { llvm::stable_sort(MemEvents.second.Data); // Release intermediate storage. - clear(BranchLBRs); - clear(FallthroughLBRs); + clear(Traces); clear(BasicSamples); clear(MemSamples); } @@ -727,50 +730,54 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, return true; } +bool DataAggregator::checkReturn(uint64_t Addr) { + auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); }; + if (llvm::is_contained(Returns, Addr)) + return true; + + BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr); + if (!Func) + return false; + + const uint64_t Offset = Addr - Func->getAddress(); + if (Func->hasInstructions() + ? 
isReturn(Func->getInstructionAtOffset(Offset)) + : isReturn(Func->disassembleInstructionAtOffset(Offset))) { + Returns.emplace(Addr); + return true; + } + return false; +} + bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds) { - // Returns whether \p Offset in \p Func contains a return instruction. - auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) { - auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); }; - return Func.hasInstructions() - ? isReturn(Func.getInstructionAtOffset(Offset)) - : isReturn(Func.disassembleInstructionAtOffset(Offset)); - }; - // Mutates \p Addr to an offset into the containing function, performing BAT // offset translation and parent lookup. // - // Returns the containing function (or BAT parent) and whether the address - // corresponds to a return (if \p IsFrom) or a call continuation (otherwise). + // Returns the containing function (or BAT parent). auto handleAddress = [&](uint64_t &Addr, bool IsFrom) { BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr); if (!Func) { Addr = 0; - return std::pair{Func, false}; + return Func; } Addr -= Func->getAddress(); - bool IsRet = IsFrom && checkReturn(*Func, Addr); - if (BAT) Addr = BAT->translate(Func->getAddress(), Addr, IsFrom); if (BinaryFunction *ParentFunc = getBATParentFunction(*Func)) - Func = ParentFunc; + return ParentFunc; - return std::pair{Func, IsRet}; + return Func; }; - auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true); - auto [ToFunc, _] = handleAddress(To, /*IsFrom*/ false); + BinaryFunction *FromFunc = handleAddress(From, /*IsFrom*/ true); + BinaryFunction *ToFunc = handleAddress(To, /*IsFrom*/ false); if (!FromFunc && !ToFunc) return false; - // Ignore returns. - if (IsReturn) - return true; - // Treat recursive control transfers as inter-branches. 
if (FromFunc == ToFunc && To != 0) { recordBranch(*FromFunc, From, To, Count, Mispreds); @@ -780,37 +787,20 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds); } -bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, - uint64_t Count) { - BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(First.To); - BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(Second.From); +bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count, + bool IsReturn) { + const uint64_t From = Trace.From, To = Trace.To; + BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From); + BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To); + NumTraces += Count; if (!FromFunc || !ToFunc) { - LLVM_DEBUG({ - dbgs() << "Out of range trace starting in "; - if (FromFunc) - dbgs() << formatv("{0} @ {1:x}", *FromFunc, - First.To - FromFunc->getAddress()); - else - dbgs() << Twine::utohexstr(First.To); - dbgs() << " and ending in "; - if (ToFunc) - dbgs() << formatv("{0} @ {1:x}", *ToFunc, - Second.From - ToFunc->getAddress()); - else - dbgs() << Twine::utohexstr(Second.From); - dbgs() << '\n'; - }); + LLVM_DEBUG(dbgs() << "Out of range trace " << Trace << '\n'); NumLongRangeTraces += Count; return false; } if (FromFunc != ToFunc) { + LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n'); NumInvalidTraces += Count; - LLVM_DEBUG({ - dbgs() << "Invalid trace starting in " << FromFunc->getPrintName() - << formatv(" @ {0:x}", First.To - FromFunc->getAddress()) - << " and ending in " << ToFunc->getPrintName() - << formatv(" @ {0:x}\n", Second.From - ToFunc->getAddress()); - }); return false; } @@ -818,28 +808,21 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc); if (!ParentFunc) ParentFunc = FromFunc; - ParentFunc->SampleCountInBytes += Count * (Second.From - 
First.To); + ParentFunc->SampleCountInBytes += Count * (To - From); const uint64_t FuncAddress = FromFunc->getAddress(); std::optional FTs = BAT && BAT->isBATFunction(FuncAddress) - ? BAT->getFallthroughsInTrace(FuncAddress, First.To, Second.From) - : getFallthroughsInTrace(*FromFunc, First, Second, Count); + ? BAT->getFallthroughsInTrace(FuncAddress, From - IsReturn, To) + : getFallthroughsInTrace(*FromFunc, Trace, Count, IsReturn); if (!FTs) { - LLVM_DEBUG( - dbgs() << "Invalid trace starting in " << FromFunc->getPrintName() - << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress()) - << " and ending in " << ToFunc->getPrintName() << " @ " - << ToFunc->getPrintName() << " @ " - << Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n'); + LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n'); NumInvalidTraces += Count; return false; } LLVM_DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for " - << FromFunc->getPrintName() << ":" - << Twine::utohexstr(First.To) << " to " - << Twine::utohexstr(Second.From) << ".\n"); + << FromFunc->getPrintName() << ":" << Trace << '\n'); for (auto [From, To] : *FTs) { if (BAT) { From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true); @@ -852,17 +835,15 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, } std::optional, 16>> -DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, - const LBREntry &FirstLBR, - const LBREntry &SecondLBR, - uint64_t Count) const { +DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, + uint64_t Count, bool IsReturn) const { SmallVector, 16> Branches; BinaryContext &BC = BF.getBinaryContext(); // Offsets of the trace within this function. 
- const uint64_t From = FirstLBR.To - BF.getAddress(); - const uint64_t To = SecondLBR.From - BF.getAddress(); + const uint64_t From = Trace.From - BF.getAddress(); + const uint64_t To = Trace.To - BF.getAddress(); if (From > To) return std::nullopt; @@ -889,8 +870,13 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, // Adjust FromBB if the first LBR is a return from the last instruction in // the previous block (that instruction should be a call). - if (From == FromBB->getOffset() && !BF.containsAddress(FirstLBR.From) && - !FromBB->isEntryPoint() && !FromBB->isLandingPad()) { + if (IsReturn) { + if (From) + FromBB = BF.getBasicBlockContainingOffset(From - 1); + else + LLVM_DEBUG(dbgs() << "return to the function start: " << Trace << '\n'); + } else if (Trace.Branch == Trace::EXTERNAL && From == FromBB->getOffset() && + !FromBB->isEntryPoint() && !FromBB->isLandingPad()) { const BinaryBasicBlock *PrevBB = BF.getLayout().getBlock(FromBB->getIndex() - 1); if (PrevBB->getSuccessor(FromBB->getLabel())) { @@ -898,10 +884,9 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, if (Instr && BC.MIB->isCall(*Instr)) FromBB = PrevBB; else - LLVM_DEBUG(dbgs() << "invalid incoming LBR (no call): " << FirstLBR - << '\n'); + LLVM_DEBUG(dbgs() << "invalid trace (no call): " << Trace << '\n'); } else { - LLVM_DEBUG(dbgs() << "invalid incoming LBR: " << FirstLBR << '\n'); + LLVM_DEBUG(dbgs() << "invalid trace: " << Trace << '\n'); } } @@ -920,9 +905,7 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, // Check for bad LBRs. 
if (!BB->getSuccessor(NextBB->getLabel())) { - LLVM_DEBUG(dbgs() << "no fall-through for the trace:\n" - << " " << FirstLBR << '\n' - << " " << SecondLBR << '\n'); + LLVM_DEBUG(dbgs() << "no fall-through for the trace: " << Trace << '\n'); return std::nullopt; } @@ -1227,14 +1210,15 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { FT_EXTERNAL_ORIGIN // f } Type = INVALID; - // The number of fields to parse, set based on Type. + /// The number of fields to parse, set based on \p Type. int AddrNum = 0; int CounterNum = 0; - // Storage for parsed fields. + /// Storage for parsed fields. StringRef EventName; std::optional Addr[3]; int64_t Counters[2] = {0}; + /// Parse strings: record type and optionally an event name. while (Type == INVALID || Type == EVENT_NAME) { while (checkAndConsumeFS()) { } @@ -1268,6 +1252,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1); } + /// Parse locations depending on entry type, recording them in \p Addr array. for (int I = 0; I < AddrNum; ++I) { while (checkAndConsumeFS()) { } @@ -1277,6 +1262,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { Addr[I] = AddrOrErr.get(); } + /// Parse counters depending on entry type. for (int I = 0; I < CounterNum; ++I) { while (checkAndConsumeFS()) { } @@ -1287,11 +1273,13 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { Counters[I] = CountOrErr.get(); } + /// Expect end of line here. if (!checkAndConsumeNewLine()) { reportError("expected end of line"); return make_error_code(llvm::errc::io_error); } + /// Record event name into \p EventNames and return. if (Type == EVENT_NAME) { EventNames.insert(EventName); return std::error_code(); @@ -1305,6 +1293,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { int64_t Count = Counters[0]; int64_t Mispreds = Counters[1]; + /// Record basic IP sample into \p BasicSamples and return. 
if (Type == SAMPLE) { BasicSamples[FromOffset] += Count; NumTotalSamples += Count; @@ -1316,30 +1305,26 @@ std::error_code DataAggregator::parseAggregatedLBREntry() { if (ToFunc) ToFunc->setHasProfileAvailable(); - Trace Trace(FromOffset, ToOffset); - // Taken trace - if (Type == TRACE || Type == BRANCH) { - TakenBranchInfo &Info = BranchLBRs[Trace]; - Info.TakenCount += Count; - Info.MispredCount += Mispreds; - - NumTotalSamples += Count; + /// For legacy fall-through types, adjust locations to match Trace container. + if (Type == FT || Type == FT_EXTERNAL_ORIGIN) { + Addr[2] = Location(Addr[1]->Offset); // Trace To + Addr[1] = Location(Addr[0]->Offset); // Trace From + // Put a magic value into Trace Branch to differentiate from a full trace. + Addr[0] = Location(Type == FT ? Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN); } - // Construct fallthrough part of the trace - if (Type == TRACE) { - const uint64_t TraceFtEndOffset = Addr[2]->Offset; - Trace.From = ToOffset; - Trace.To = TraceFtEndOffset; - Type = FromFunc == ToFunc ? FT : FT_EXTERNAL_ORIGIN; - } - // Add fallthrough trace - if (Type != BRANCH) { - FTInfo &Info = FallthroughLBRs[Trace]; - (Type == FT ? Info.InternCount : Info.ExternCount) += Count; - NumTraces += Count; + /// For legacy branch type, mark Trace To to differentite from a full trace. + if (Type == BRANCH) { + Addr[2] = Location(Trace::BR_ONLY); } + /// Record a trace. 
+ Trace T{Addr[0]->Offset, Addr[1]->Offset, Addr[2]->Offset}; + TakenBranchInfo TI{(uint64_t)Count, (uint64_t)Mispreds}; + Traces.emplace_back(T, TI); + + NumTotalSamples += Count; + return std::error_code(); } @@ -1350,7 +1335,7 @@ bool DataAggregator::ignoreKernelInterrupt(LBREntry &LBR) const { std::error_code DataAggregator::printLBRHeatMap() { outs() << "PERF2BOLT: parse branch events...\n"; - NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, + NamedRegionTimer T("buildHeatmap", "Building heatmap", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); if (BC->IsLinuxKernel) { @@ -1386,12 +1371,9 @@ std::error_code DataAggregator::printLBRHeatMap() { // Register basic samples and perf LBR addresses not covered by fallthroughs. for (const auto &[PC, Hits] : BasicSamples) HM.registerAddress(PC, Hits); - for (const auto &LBR : FallthroughLBRs) { - const Trace &Trace = LBR.first; - const FTInfo &Info = LBR.second; - HM.registerAddressRange(Trace.From, Trace.To, - Info.InternCount + Info.ExternCount); - } + for (const auto &[Trace, Info] : Traces) + if (Trace.To != Trace::BR_ONLY) + HM.registerAddressRange(Trace.From, Trace.To, Info.TakenCount); if (HM.getNumInvalidRanges()) outs() << "HEATMAP: invalid traces: " << HM.getNumInvalidRanges() << '\n'; @@ -1437,22 +1419,10 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample, // chronological order) if (NeedsSkylakeFix && NumEntry <= 2) continue; - if (NextLBR) { - // Record fall-through trace. - const uint64_t TraceFrom = LBR.To; - const uint64_t TraceTo = NextLBR->From; - const BinaryFunction *TraceBF = - getBinaryFunctionContainingAddress(TraceFrom); - FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)]; - if (TraceBF && TraceBF->containsAddress(LBR.From)) - ++Info.InternCount; - else - ++Info.ExternCount; - ++NumTraces; - } + uint64_t TraceTo = NextLBR ? 
NextLBR->From : Trace::BR_ONLY; NextLBR = &LBR; - TakenBranchInfo &Info = BranchLBRs[Trace(LBR.From, LBR.To)]; + TakenBranchInfo &Info = TraceMap[Trace{LBR.From, LBR.To, TraceTo}]; ++Info.TakenCount; Info.MispredCount += LBR.Mispred; } @@ -1563,10 +1533,14 @@ std::error_code DataAggregator::parseBranchEvents() { parseLBRSample(Sample, NeedsSkylakeFix); } - for (const Trace &Trace : llvm::make_first_range(BranchLBRs)) - for (const uint64_t Addr : {Trace.From, Trace.To}) + Traces.reserve(TraceMap.size()); + for (const auto &[Trace, Info] : TraceMap) { + Traces.emplace_back(Trace, Info); + for (const uint64_t Addr : {Trace.Branch, Trace.From}) if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr)) BF->setHasProfileAvailable(); + } + clear(TraceMap); outs() << "PERF2BOLT: read " << NumSamples << " samples and " << NumEntries << " LBR entries\n"; @@ -1591,23 +1565,14 @@ void DataAggregator::processBranchEvents() { NamedRegionTimer T("processBranch", "Processing branch events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); - for (const auto &AggrLBR : FallthroughLBRs) { - const Trace &Loc = AggrLBR.first; - const FTInfo &Info = AggrLBR.second; - LBREntry First{Loc.From, Loc.From, false}; - LBREntry Second{Loc.To, Loc.To, false}; - if (Info.InternCount) - doTrace(First, Second, Info.InternCount); - if (Info.ExternCount) { - First.From = 0; - doTrace(First, Second, Info.ExternCount); - } - } - - for (const auto &AggrLBR : BranchLBRs) { - const Trace &Loc = AggrLBR.first; - const TakenBranchInfo &Info = AggrLBR.second; - doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); + for (const auto &[Trace, Info] : Traces) { + bool IsReturn = checkReturn(Trace.Branch); + // Ignore returns. 
+ if (!IsReturn && Trace.Branch != Trace::FT_ONLY && + Trace.Branch != Trace::FT_EXTERNAL_ORIGIN) + doBranch(Trace.Branch, Trace.From, Info.TakenCount, Info.MispredCount); + if (Trace.To != Trace::BR_ONLY) + doTrace(Trace, Info.TakenCount, IsReturn); } printBranchSamplesDiagnostics(); } diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index 4994cfb541eef..c2ef024db9475 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -4,29 +4,43 @@ # RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so ## Link against a DSO to ensure PLT entries. # RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib -# RUN: link_fdata %s %t %t.pat PREAGGT1 -# RUN: link_fdata %s %t %t.pat2 PREAGGT2 -# RUN-DISABLED: link_fdata %s %t %t.patplt PREAGGPLT +# Trace to a call continuation, not a landing pad/entry point +# RUN: link_fdata %s %t %t.pa-base PREAGG-BASE +# Trace from a return to a landing pad/entry point call continuation +# RUN: link_fdata %s %t %t.pa-ret PREAGG-RET +# Trace from an external location to a landing pad/entry point call continuation +# RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT +# RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT # RUN: llvm-strip --strip-unneeded %t -o %t.strip # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh ## Check pre-aggregated traces attach call continuation fallthrough count -# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \ -# RUN: --print-cfg --print-only=main | FileCheck %s - -## Check pre-aggregated traces don't attach call continuation fallthrough count -## to secondary entry point (unstripped) -# RUN: llvm-bolt %t --pa -p %t.pat2 -o %t.out \ -# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3 -## Check pre-aggregated traces don't attach call continuation fallthrough count -## to landing pad (stripped, LP) -# RUN: llvm-bolt %t.strip --pa -p %t.pat2 -o %t.out \ -# RUN: --print-cfg --print-only=main | FileCheck %s 
--check-prefix=CHECK3 +## in the basic case (not an entry point, not a landing pad). +# RUN: llvm-bolt %t.noeh --pa -p %t.pa-base -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-BASE + +## Check pre-aggregated traces from a return attach call continuation +## fallthrough count to secondary entry point (unstripped) +# RUN: llvm-bolt %t --pa -p %t.pa-ret -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH +## Check pre-aggregated traces from a return attach call continuation +## fallthrough count to landing pad (stripped, landing pad) +# RUN: llvm-bolt %t.strip --pa -p %t.pa-ret -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH + +## Check pre-aggregated traces from external location don't attach call +## continuation fallthrough count to secondary entry point (unstripped) +# RUN: llvm-bolt %t --pa -p %t.pa-ext -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP +## Check pre-aggregated traces from external location don't attach call +## continuation fallthrough count to landing pad (stripped, landing pad) +# RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \ +# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP ## Check pre-aggregated traces don't report zero-sized PLT fall-through as ## invalid trace -# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.patplt -o %t.out | FileCheck %s \ +# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \ # RUN-DISABLED: --check-prefix=CHECK-PLT # CHECK-PLT: traces mismatching disassembled function contents: 0 @@ -56,11 +70,11 @@ main: Ltmp0_br: callq puts@PLT ## Check PLT traces are accepted -# PREAGGPLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3 +# PREAGG-PLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3 ## Target is an external-origin call continuation -# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2 -# CHECK: callq puts@PLT -# CHECK-NEXT: 
count: 2 +# PREAGG-BASE: T X:0 #Ltmp1# #Ltmp4_br# 2 +# CHECK-BASE: callq puts@PLT +# CHECK-BASE-NEXT: count: 2 Ltmp1: movq -0x10(%rbp), %rax @@ -71,24 +85,18 @@ Ltmp4: cmpl $0x0, -0x14(%rbp) Ltmp4_br: je Ltmp0 -# CHECK2: je .Ltmp0 -# CHECK2-NEXT: count: 3 movl $0xa, -0x18(%rbp) callq foo ## Target is a binary-local call continuation -# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1 -# CHECK: callq foo -# CHECK-NEXT: count: 1 - -## PLT call continuation fallthrough spanning the call -# CHECK2: callq foo -# CHECK2-NEXT: count: 3 - +# PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1 ## Target is a secondary entry point (unstripped) or a landing pad (stripped) -# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2 -# CHECK3: callq foo -# CHECK3-NEXT: count: 0 +# PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1 + +# CHECK-ATTACH: callq foo +# CHECK-ATTACH-NEXT: count: 1 +# CHECK-SKIP: callq foo +# CHECK-SKIP-NEXT: count: 0 Ltmp3: cmpl $0x0, -0x18(%rbp) diff --git a/bolt/test/lit.local.cfg b/bolt/test/lit.local.cfg index d5a6849b27a77..8a61d11f5825f 100644 --- a/bolt/test/lit.local.cfg +++ b/bolt/test/lit.local.cfg @@ -1,6 +1,11 @@ -host_linux_triple = config.target_triple.split("-")[0] + "-unknown-linux-gnu" +host_triple = config.target_triple + +# Force triple on non-linux hosts to get ELF binaries on all platforms. 
+if not "linux" in host_triple: + host_triple = host_triple.split("-")[0] + "-unknown-linux-gnu" + common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -Wl,--build-id=none -pie" -flags = f"--target={host_linux_triple} -fPIE {common_linker_flags}" +flags = f"--target={host_triple} -fPIE {common_linker_flags}" config.substitutions.insert(0, ("%cflags", f"%cflags {flags}")) config.substitutions.insert(0, ("%cxxflags", f"%cxxflags {flags}")) diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp index 820d644ef8b83..71a926f1c73e0 100644 --- a/clang-tools-extra/clang-doc/Representation.cpp +++ b/clang-tools-extra/clang-doc/Representation.cpp @@ -147,6 +147,7 @@ mergeInfos(std::vector> &Values) { return llvm::createStringError(llvm::inconvertibleErrorCode(), "unexpected info type"); } + llvm_unreachable("unhandled enumerator"); } bool CommentInfo::operator==(const CommentInfo &Other) const { diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp index e8f1a9cee2675..820e8bfd8e644 100644 --- a/clang-tools-extra/clang-doc/Serialize.cpp +++ b/clang-tools-extra/clang-doc/Serialize.cpp @@ -392,6 +392,7 @@ std::string serialize(std::unique_ptr &I) { case InfoType::IT_default: return ""; } + llvm_unreachable("unhandled enumerator"); } static void parseFullComment(const FullComment *C, CommentInfo &CI) { diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp index a877f9a7ee912..d89c3a69fc841 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp @@ -50,7 +50,8 @@ UnnecessaryValueParamCheck::UnnecessaryValueParamCheck( utils::IncludeSorter::IS_LLVM), areDiagsSelfContained()), AllowedTypes( - utils::options::parseStringList(Options.get("AllowedTypes", ""))) 
{} + utils::options::parseStringList(Options.get("AllowedTypes", ""))), + IgnoreCoroutines(Options.get("IgnoreCoroutines", true)) {} void UnnecessaryValueParamCheck::registerMatchers(MatchFinder *Finder) { const auto ExpensiveValueParamDecl = parmVarDecl( @@ -61,12 +62,14 @@ void UnnecessaryValueParamCheck::registerMatchers(MatchFinder *Finder) { matchers::matchesAnyListedName(AllowedTypes))))))), decl().bind("param")); Finder->addMatcher( - traverse( - TK_AsIs, - functionDecl(hasBody(stmt()), isDefinition(), unless(isImplicit()), - unless(cxxMethodDecl(anyOf(isOverride(), isFinal()))), - has(typeLoc(forEach(ExpensiveValueParamDecl))), - decl().bind("functionDecl"))), + traverse(TK_AsIs, + functionDecl( + hasBody(IgnoreCoroutines ? stmt(unless(coroutineBodyStmt())) + : stmt()), + isDefinition(), unless(isImplicit()), + unless(cxxMethodDecl(anyOf(isOverride(), isFinal()))), + has(typeLoc(forEach(ExpensiveValueParamDecl))), + decl().bind("functionDecl"))), this); } @@ -123,6 +126,7 @@ void UnnecessaryValueParamCheck::storeOptions( Options.store(Opts, "IncludeStyle", Inserter.getStyle()); Options.store(Opts, "AllowedTypes", utils::options::serializeStringList(AllowedTypes)); + Options.store(Opts, "IgnoreCoroutines", IgnoreCoroutines); } void UnnecessaryValueParamCheck::onEndOfTranslationUnit() { diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h index 8bfd814d16357..b52043416e769 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h @@ -46,6 +46,7 @@ class UnnecessaryValueParamCheck : public ClangTidyCheck { ExprMutationAnalyzer::Memoized MutationAnalyzerCache; utils::IncludeInserter Inserter; const std::vector AllowedTypes; + bool IgnoreCoroutines; }; } // namespace clang::tidy::performance diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp 
b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 29321f7cd3fa2..a703009e2b467 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -494,9 +494,9 @@ static std::vector semanticTokenModifiers() { void ClangdLSPServer::onInitialize(const InitializeParams &Params, Callback Reply) { // Determine character encoding first as it affects constructed ClangdServer. - if (Params.capabilities.offsetEncoding && !Opts.Encoding) { + if (Params.capabilities.PositionEncodings && !Opts.Encoding) { Opts.Encoding = OffsetEncoding::UTF16; // fallback - for (OffsetEncoding Supported : *Params.capabilities.offsetEncoding) + for (OffsetEncoding Supported : *Params.capabilities.PositionEncodings) if (Supported != OffsetEncoding::UnsupportedEncoding) { Opts.Encoding = Supported; break; @@ -686,6 +686,9 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, ServerCaps["executeCommandProvider"] = llvm::json::Object{{"commands", Commands}}; + if (Opts.Encoding) + ServerCaps["positionEncoding"] = *Opts.Encoding; + llvm::json::Object Result{ {{"serverInfo", llvm::json::Object{ @@ -693,6 +696,9 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, {"version", llvm::formatv("{0} {1} {2}", versionString(), featureString(), platformString())}}}, {"capabilities", std::move(ServerCaps)}}}; + + // TODO: offsetEncoding capability is a deprecated clangd extension and should + // be deleted. 
if (Opts.Encoding) Result["offsetEncoding"] = *Opts.Encoding; Reply(std::move(Result)); diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp index c9e8a175b5d76..2c858e28fa243 100644 --- a/clang-tools-extra/clangd/Protocol.cpp +++ b/clang-tools-extra/clangd/Protocol.cpp @@ -497,10 +497,19 @@ bool fromJSON(const llvm::json::Value &Params, ClientCapabilities &R, if (auto Cancel = StaleRequestSupport->getBoolean("cancel")) R.CancelsStaleRequests = *Cancel; } + if (auto *PositionEncodings = General->get("positionEncodings")) { + R.PositionEncodings.emplace(); + if (!fromJSON(*PositionEncodings, *R.PositionEncodings, + P.field("general").field("positionEncodings"))) + return false; + } } if (auto *OffsetEncoding = O->get("offsetEncoding")) { - R.offsetEncoding.emplace(); - if (!fromJSON(*OffsetEncoding, *R.offsetEncoding, + R.PositionEncodings.emplace(); + elog("offsetEncoding capability is a deprecated clangd extension that'll " + "go away with clangd 23. Migrate to standard positionEncodings " + "capability introduced by LSP 3.17"); + if (!fromJSON(*OffsetEncoding, *R.PositionEncodings, P.field("offsetEncoding"))) return false; } @@ -536,8 +545,11 @@ bool fromJSON(const llvm::json::Value &Params, ClientCapabilities &R, } } if (auto *OffsetEncoding = Experimental->get("offsetEncoding")) { - R.offsetEncoding.emplace(); - if (!fromJSON(*OffsetEncoding, *R.offsetEncoding, + R.PositionEncodings.emplace(); + elog("offsetEncoding capability is a deprecated clangd extension that'll " + "go away with clangd 23. 
Migrate to standard positionEncodings " + "capability introduced by LSP 3.17"); + if (!fromJSON(*OffsetEncoding, *R.PositionEncodings, P.field("offsetEncoding"))) return false; } diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h index 8a7809d6677ee..3a6bf155ee153 100644 --- a/clang-tools-extra/clangd/Protocol.h +++ b/clang-tools-extra/clangd/Protocol.h @@ -528,8 +528,9 @@ struct ClientCapabilities { /// textDocument.semanticHighlightingCapabilities.semanticHighlighting bool TheiaSemanticHighlighting = false; - /// Supported encodings for LSP character offsets. (clangd extension). - std::optional> offsetEncoding; + /// Supported encodings for LSP character offsets. + /// general.positionEncodings + std::optional> PositionEncodings; /// The content format that should be used for Hover requests. /// textDocument.hover.contentEncoding diff --git a/clang-tools-extra/clangd/test/positionencoding.test b/clang-tools-extra/clangd/test/positionencoding.test new file mode 100644 index 0000000000000..eea7a1a596e9a --- /dev/null +++ b/clang-tools-extra/clangd/test/positionencoding.test @@ -0,0 +1,32 @@ +# RUN: clangd -lit-test < %s | FileCheck -strict-whitespace %s +# This test verifies that we can negotiate UTF-8 offsets via the positionEncodings capability introduced in LSP 3.17. +{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{"general":{"positionEncodings":["utf-8","utf-16"]}},"trace":"off"}} +# CHECK: "positionEncoding": "utf-8" +--- +{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"/*ö*/int x;\nint y=x;"}}} +--- +{"jsonrpc":"2.0","id":1,"method":"textDocument/definition","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":1,"character":6}}} +# /*ö*/int x; +# 01234567890 +# x is character (and utf-16) range [9,10) but byte range [10,11). 
+# CHECK: "id": 1, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": [ +# CHECK-NEXT: { +# CHECK-NEXT: "range": { +# CHECK-NEXT: "end": { +# CHECK-NEXT: "character": 11, +# CHECK-NEXT: "line": 0 +# CHECK-NEXT: }, +# CHECK-NEXT: "start": { +# CHECK-NEXT: "character": 10, +# CHECK-NEXT: "line": 0 +# CHECK-NEXT: } +# CHECK-NEXT: }, +# CHECK-NEXT: "uri": "file://{{.*}}/main.cpp" +# CHECK-NEXT: } +# CHECK-NEXT: ] +--- +{"jsonrpc":"2.0","id":10000,"method":"shutdown"} +--- +{"jsonrpc":"2.0","method":"exit"} diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 19ccd1790e757..3c1ca2f929044 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -265,6 +265,8 @@ Changes in existing checks ` check performance by tolerating fix-it breaking compilation when functions is used as pointers to avoid matching usage of functions within the current compilation unit. + Added an option `IgnoreCoroutines` with the default value `true` to + suppress this check for coroutines where passing by reference may be unsafe. - Improved :doc:`readability-convert-member-functions-to-static ` check by diff --git a/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst b/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst index f72b8c7eabc22..b7631139a0133 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst @@ -34,7 +34,7 @@ dependent). .. 
code-block:: c++ // AFTER - enum Color : std:int8_t { + enum Color : std::int8_t { RED = -1, GREEN = 0, BLUE = 1 diff --git a/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst b/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst index dc86530b95f13..cd25d7d94d99b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst @@ -56,7 +56,7 @@ Will become: Because the fix-it needs to change the signature of the function, it may break builds if the function is used in multiple translation units or some codes -depends on funcion signatures. +depends on function signatures. Options ------- @@ -74,3 +74,10 @@ Options default is empty. If a name in the list contains the sequence `::`, it is matched against the qualified type name (i.e. ``namespace::Type``), otherwise it is matched against only the type name (i.e. ``Type``). + +.. option:: IgnoreCoroutines + + A boolean specifying whether the check should suggest passing parameters by + reference in coroutines. Passing parameters by reference in coroutines may + not be safe, please see :doc:`cppcoreguidelines-avoid-reference-coroutine-parameters <../cppcoreguidelines/avoid-reference-coroutine-parameters>` + for more information. Default is `true`. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp new file mode 100644 index 0000000000000..0a84dc4676470 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp @@ -0,0 +1,65 @@ +// RUN: %check_clang_tidy -std=c++20-or-later %s performance-unnecessary-value-param %t -- -fix-errors +// RUN: %check_clang_tidy -std=c++20-or-later %s performance-unnecessary-value-param %t -- \ +// RUN: -config='{CheckOptions: {performance-unnecessary-value-param.IgnoreCoroutines: true}}' -fix-errors +// RUN: %check_clang_tidy -check-suffix=ALLOWED -std=c++20-or-later %s performance-unnecessary-value-param %t -- \ +// RUN: -config='{CheckOptions: {performance-unnecessary-value-param.IgnoreCoroutines: false}}' -fix-errors + +namespace std { + +template struct coroutine_traits { + using promise_type = typename Ret::promise_type; +}; + +template struct coroutine_handle { + static coroutine_handle from_address(void *) noexcept; + static coroutine_handle from_promise(Promise &promise); + constexpr void *address() const noexcept; +}; + +template <> struct coroutine_handle { + template + coroutine_handle(coroutine_handle) noexcept; + static coroutine_handle from_address(void *); + constexpr void *address() const noexcept; +}; + +struct suspend_always { + bool await_ready() noexcept { return false; } + void await_suspend(coroutine_handle<>) noexcept {} + void await_resume() noexcept {} +}; + +struct suspend_never { + bool await_ready() noexcept { return true; } + void await_suspend(coroutine_handle<>) noexcept {} + void await_resume() noexcept {} +}; + +} // namespace std + +struct ReturnObject { + struct promise_type { + ReturnObject get_return_object() { return {}; } + ReturnObject return_void() { return {}; } + std::suspend_always initial_suspend() { return {}; } + std::suspend_always 
final_suspend() noexcept { return {}; } + void unhandled_exception() {} + std::suspend_always yield_value(int value) { return {}; } + }; +}; + +struct A { + A(const A&); +}; + +ReturnObject foo_coroutine(const A a) { +// CHECK-MESSAGES-ALLOWED: [[@LINE-1]]:36: warning: the const qualified parameter 'a' +// CHECK-FIXES: ReturnObject foo_coroutine(const A a) { + co_return; +} + +ReturnObject foo_not_coroutine(const A a) { +// CHECK-MESSAGES: [[@LINE-1]]:40: warning: the const qualified parameter 'a' +// CHECK-MESSAGES-ALLOWED: [[@LINE-2]]:40: warning: the const qualified parameter 'a' + return ReturnObject{}; +} diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 33ee8a53b5f37..12816eed2e8b5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -65,8 +65,10 @@ C++ Specific Potentially Breaking Changes standard library already have their own bespoke builtins. - A workaround for libstdc++4.7 has been removed. Note that 4.8.3 remains the oldest supported libstdc++ version. - - Added ``!nonnull/!align`` metadata to load of references for better codegen. +- Checking for int->enum conversions in constant expressions is more strict; + in particular, ``const E x = (E)-1;`` is not treated as a constant if it's + out of range. This impacts old versions of Boost. (#GH143034) ABI Changes in This Version --------------------------- @@ -339,6 +341,8 @@ New Compiler Flags - New option ``-Wnrvo`` added and disabled by default to warn about missed NRVO opportunities. +- New option ``-ignore-pch`` added to disable precompiled headers. It overrides ``-emit-pch`` and ``-include-pch``. (#GH142409, `PCHDocs `_). + Deprecated Compiler Flags ------------------------- @@ -357,6 +361,8 @@ Modified Compiler Flags - The ``-fchar8_t`` flag is no longer considered in non-C++ languages modes. (#GH55373) +- The ``-fveclib=libmvec`` option now supports AArch64 targets (requires GLIBC 2.40 or newer). 
+ Removed Compiler Flags ------------------------- @@ -688,6 +694,8 @@ Bug Fixes in This Version ``#include`` directive. (#GH138094) - Fixed a crash during constant evaluation involving invalid lambda captures (#GH138832) +- Fixed compound literal is not constant expression inside initializer list + (#GH87867) - Fixed a crash when instantiating an invalid dependent friend template specialization. (#GH139052) - Fixed a crash with an invalid member function parameter list with a default @@ -704,6 +712,7 @@ Bug Fixes in This Version - Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. (#GH143168) - Fixed incorrect token location when emitting diagnostics for tokens expanded from macros. (#GH143216) - Fixed an infinite recursion when checking constexpr destructors. (#GH141789) +- Fixed a crash when a malformed using declaration appears in a ``constexpr`` function. (#GH144264) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/docs/Toolchain.rst b/clang/docs/Toolchain.rst index 958199eb7a2e2..d56b21d74c7e3 100644 --- a/clang/docs/Toolchain.rst +++ b/clang/docs/Toolchain.rst @@ -347,3 +347,8 @@ workarounds for issues discovered in libstdc++, and these are removed as fixed libstdc++ becomes sufficiently old. You can instruct Clang to use libstdc++ with the ``-stdlib=libstdc++`` flag. + +GCC Installation +================= +Users can point to their GCC installation by using the ``-gcc-toolchain`` or by +using ``-gcc-install-dir`` flag. diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 62844f7e6a2fa..284a404026dfe 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -1458,6 +1458,19 @@ will be processed from the PCH file. Otherwise, Clang will report an error. ``test.h`` since ``test.h`` was included directly in the source file and not specified on the command line using ``-include-pch``. 
+Ignoring a PCH File +^^^^^^^^^^^^^^^^^^^ + +To ignore PCH options, a `-ignore-pch` option is passed to ``clang``: + +.. code-block:: console + + $ clang -x c-header test.h -Xclang -ignore-pch -o test.h.pch + $ clang -include-pch test.h.pch -Xclang -ignore-pch test.c -o test + +This option disables precompiled headers, overrides -emit-pch and -include-pch. +test.h.pch is not generated and not used as a prefix header. + Relocatable PCH Files ^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 3abb49312255a..e01361e2466b5 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -629,10 +629,48 @@ class ASTContext : public RefCountedBase { void setRelocationInfoForCXXRecord(const CXXRecordDecl *, CXXRecordDeclRelocationInfo); + /// Examines a given type, and returns whether the type itself + /// is address discriminated, or any transitively embedded types + /// contain data that is address discriminated. This includes + /// implicitly authenticated values like vtable pointers, as well as + /// explicitly qualified fields. + bool containsAddressDiscriminatedPointerAuth(QualType T) { + if (!isPointerAuthenticationAvailable()) + return false; + return findPointerAuthContent(T) != PointerAuthContent::None; + } + + /// Examines a given type, and returns whether the type itself + /// or any data it transitively contains has a pointer authentication + /// schema that is not safely relocatable. e.g. any data or fields + /// with address discrimination other than any otherwise similar + /// vtable pointers. + bool containsNonRelocatablePointerAuth(QualType T) { + if (!isPointerAuthenticationAvailable()) + return false; + return findPointerAuthContent(T) == + PointerAuthContent::AddressDiscriminatedData; + } + private: llvm::DenseMap RelocatableClasses; + // FIXME: store in RecordDeclBitfields in future? 
+ enum class PointerAuthContent : uint8_t { + None, + AddressDiscriminatedVTable, + AddressDiscriminatedData + }; + + // A simple helper function to short circuit pointer auth checks. + bool isPointerAuthenticationAvailable() const { + return LangOpts.PointerAuthCalls || LangOpts.PointerAuthIntrinsics; + } + PointerAuthContent findPointerAuthContent(QualType T); + llvm::DenseMap + RecordContainsAddressDiscriminatedPointerAuth; + ImportDecl *FirstLocalImport = nullptr; ImportDecl *LastLocalImport = nullptr; @@ -3668,6 +3706,7 @@ OPT_LIST(V) /// authentication policy for the specified record. const CXXRecordDecl * baseForVTableAuthentication(const CXXRecordDecl *ThisClass); + bool useAbbreviatedThunkName(GlobalDecl VirtualMethodDecl, StringRef MangledName); diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 047f51ffa59ed..6051e1fc45111 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2934,7 +2934,7 @@ https://gcc.gnu.org/onlinedocs/gcc/RISC-V-Function-Attributes.html https://riscv.org/specifications/privileged-isa/ The RISC-V Instruction Set Manual Volume II: Privileged Architecture Version 1.10. -https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.7 +https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0 https://sifive.cdn.prismic.io/sifive/d1984d2b-c9b9-4c91-8de0-d68a5e64fa0f_sifive-interrupt-cookbook-v1p2.pdf }]; } diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 2a30ff11464dd..e5566a540dc65 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -483,8 +483,10 @@ CODEGENOPT(StaticClosure, 1, 0) /// Assume that UAVs/SRVs may alias CODEGENOPT(ResMayAlias, 1, 0) -/// Enables unwind v2 (epilog) information for x64 Windows. 
-CODEGENOPT(WinX64EHUnwindV2, 1, 0) +/// Controls how unwind v2 (epilog) information should be generated for x64 +/// Windows. +ENUM_CODEGENOPT(WinX64EHUnwindV2, llvm::WinX64EHUnwindV2Mode, + 2, llvm::WinX64EHUnwindV2Mode::Disabled) /// FIXME: Make DebugOptions its own top-level .def file. #include "DebugOptions.def" diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 29f6480ba935c..94224e1038758 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -847,6 +847,9 @@ def note_drv_available_multilibs : Note< "available multilibs are:%0">; def err_drv_multilib_custom_error : Error< "multilib configuration error: %0">; +def warn_drv_multilib_not_available_for_target: Warning< + "no multilib structure encoded for Arm, Aarch64 and PPC targets">, + InGroup>; def err_drv_experimental_crel : Error< "-Wa,--allow-experimental-crel must be specified to use -Wa,--crel. " diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 38b4f581fa5c9..36fa3227fd6a6 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -815,19 +815,22 @@ changes to one object won't affect the others, the object's initializer will run once per copy, etc. Specifically, this warning fires when it detects an object which: -1. Is defined as ``inline`` in a header file (so it might get compiled into multiple libaries), and -2. Has external linkage (otherwise it's supposed to be duplicated), and -3. Has hidden visibility (posix) or lacks a dllimport/dllexport attribute (windows). + +#. Is defined as ``inline`` in a header file (so it might get compiled into multiple libaries), and +#. Has external linkage (otherwise it's supposed to be duplicated), and +#. Has hidden visibility (posix) or lacks a dllimport/dllexport attribute (windows). 
As well as one of the following: -1. The object is mutable, or -2. The object's initializer definitely has side effects. + +#. The object is mutable, or +#. The object's initializer definitely has side effects. The warning can be resolved by removing one of the conditions above. In rough order of preference, this may be done by: -1. Marking the object ``const`` (if possible) -2. Moving the object's definition to a source file -3. Making the object visible using ``__attribute((visibility("default")))``, + +#. Marking the object ``const`` (if possible) +#. Moving the object's definition to a source file +#. Making the object visible using ``__attribute((visibility("default")))``, ``__declspec(dllimport)``, or ``__declspec(dllexport)``. When annotating an object with ``__declspec(dllimport)`` or ``__declspec(dllexport)``, diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index bd36d228578b7..8dd1f0ce361d7 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -2155,6 +2155,8 @@ def VecCmpOp : CIR_Op<"vec.cmp", [Pure, SameTypeOperands]> { `(` $kind `,` $lhs `,` $rhs `)` `:` qualified(type($lhs)) `,` qualified(type($result)) attr-dict }]; + + let hasFolder = 1; } //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 13ddc77835fbc..3dc28e6f2e5bf 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -236,6 +236,7 @@ struct MissingFeatures { static bool runCleanupsScope() { return false; } static bool lowerAggregateLoadStore() { return false; } static bool dataLayoutTypeAllocSize() { return false; } + static bool asmLabelAttr() { return false; } // Missing types static bool dataMemberType() { return false; } diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td index 1b07deb4a8482..0ffd8c40da7da 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2167,11 +2167,14 @@ defm assume_nothrow_exception_dtor: BoolFOption<"assume-nothrow-exception-dtor", LangOpts<"AssumeNothrowExceptionDtor">, DefaultFalse, PosFlag, NegFlag>; -defm winx64_eh_unwindv2 : BoolFOption<"winx64-eh-unwindv2", - CodeGenOpts<"WinX64EHUnwindV2">, DefaultFalse, - PosFlag, - NegFlag, - BothFlags<[], [ClangOption], " unwind v2 (epilog) information for x64 Windows">>; +def winx64_eh_unwindv2 + : Joined<["-"], "fwinx64-eh-unwindv2=">, Group, + Visibility<[ClangOption, CC1Option]>, + HelpText<"Generate unwind v2 (epilog) information for x64 Windows">, + Values<"disabled,best-effort,required">, + NormalizedValues<["Disabled", "BestEffort", "Required"]>, + NormalizedValuesScope<"llvm::WinX64EHUnwindV2Mode">, + MarshallingInfoEnum, "Disabled">; def fexcess_precision_EQ : Joined<["-"], "fexcess-precision=">, Group, Visibility<[ClangOption, CLOption]>, HelpText<"Allows control over excess precision on targets where native " @@ -3348,6 +3351,9 @@ defm pch_codegen: OptInCC1FFlag<"pch-codegen", "Generate ", "Do not generate ", "code for uses of this PCH that assumes an explicit object file will be built for the PCH">; defm pch_debuginfo: OptInCC1FFlag<"pch-debuginfo", "Generate ", "Do not generate ", "debug info for types in an object file built from this PCH and do not generate them elsewhere">; +def ignore_pch : Flag<["-"], "ignore-pch">, Group, + Visibility<[ClangOption]>, + HelpText<"Disable precompiled headers, overrides -emit-pch and -include-pch">; def fimplicit_module_maps : Flag <["-"], "fimplicit-module-maps">, Group, Visibility<[ClangOption, CC1Option, CLOption]>, @@ -3473,8 +3479,9 @@ def fveclib : Joined<["-"], "fveclib=">, Group, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, HelpText<"Use the given vector functions library">, 
HelpTextForVariants<[ClangOption, CC1Option], - "Use the given vector functions library. " - "Note: -fveclib={ArmPL,SLEEF} implies -fno-math-errno">, + "Use the given vector functions library.\n" + " Note: -fveclib={ArmPL,SLEEF,libmvec} implies -fno-math-errno.\n" + " Note: -fveclib=libmvec on AArch64 requires GLIBC 2.40 or newer.">, Values<"Accelerate,libmvec,MASSV,SVML,SLEEF,Darwin_libsystem_m,ArmPL,AMDLIBM,none">, NormalizedValuesScope<"llvm::driver::VectorLibrary">, NormalizedValues<["Accelerate", "LIBMVEC", "MASSV", "SVML", "SLEEF", @@ -5147,10 +5154,16 @@ def mno_fix_cortex_a72_aes_1655431 : Flag<["-"], "mno-fix-cortex-a72-aes-1655431 Alias; def mfix_cortex_a53_835769 : Flag<["-"], "mfix-cortex-a53-835769">, Group, - HelpText<"Workaround Cortex-A53 erratum 835769 (AArch64 only)">; + HelpText<"Work around Cortex-A53 erratum 835769 (AArch64 only)">; def mno_fix_cortex_a53_835769 : Flag<["-"], "mno-fix-cortex-a53-835769">, Group, - HelpText<"Don't workaround Cortex-A53 erratum 835769 (AArch64 only)">; + HelpText<"Don't work around Cortex-A53 erratum 835769 (AArch64 only)">; +def mfix_cortex_a53_843419 : Flag<["-"], "mfix-cortex-a53-843419">, + Group, + HelpText<"Work around Cortex-A53 erratum 843419 (AArch64 only)">; +def mno_fix_cortex_a53_843419 : Flag<["-"], "mno-fix-cortex-a53-843419">, + Group, + HelpText<"Don't work around Cortex-A53 erratum 843419 (AArch64 only)">; def mmark_bti_property : Flag<["-"], "mmark-bti-property">, Group, HelpText<"Add .note.gnu.property with BTI to assembly files (AArch64 only)">; @@ -8972,7 +8985,9 @@ def _SLASH_volatile_Group : OptionGroup<"">, Group; def _SLASH_d2epilogunwind : CLFlag<"d2epilogunwind">, - HelpText<"Enable unwind v2 (epilog) information for x64 Windows">; + HelpText<"Best effort generate unwind v2 (epilog) information for x64 Windows">; +def _SLASH_d2epilogunwindrequirev2 : CLFlag<"d2epilogunwindrequirev2">, + HelpText<"Require generation of unwind v2 (epilog) information for x64 Windows">; def _SLASH_EH : 
CLJoined<"EH">, HelpText<"Set exception handling model">; def _SLASH_EP : CLFlag<"EP">, HelpText<"Disable linemarker output and preprocess to stdout">; diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 3243b94c5e5e6..a47e23ffbd357 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3598,7 +3598,7 @@ class Parser : public CodeCompletionHandler { /// keyword. bool isClassCompatibleKeyword(Token Tok) const; - void ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs); + void ParseHLSLRootSignatureAttributeArgs(ParsedAttributes &Attrs); ///@} diff --git a/clang/include/clang/Sema/Scope.h b/clang/include/clang/Sema/Scope.h index ad12a3d73413b..07b9e1bc10f5a 100644 --- a/clang/include/clang/Sema/Scope.h +++ b/clang/include/clang/Sema/Scope.h @@ -427,6 +427,17 @@ class Scope { return false; } + /// isInObjcMethodScope - Return true if this scope is, or is contained, in an + /// C function body. + bool isInCFunctionScope() const { + for (const Scope *S = this; S; S = S->getParent()) { + if (S->isFunctionScope()) + return true; + } + + return false; + } + /// isInObjcMethodScope - Return true if this scope is, or is contained in, an /// Objective-C method body. Note that this method is not constant time. bool isInObjcMethodScope() const { diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index ba5f06f93dc30..33c4b8d1568bf 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -119,6 +119,19 @@ class SemaHLSL : public SemaBase { bool IsCompAssign); void emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, BinaryOperatorKind Opc); + /// Computes the unique Root Signature identifier from the given signature, + /// then lookup if there is a previousy created Root Signature decl. 
+ /// + /// Returns the identifier and if it was found + std::pair + ActOnStartRootSignatureDecl(StringRef Signature); + + /// Creates the Root Signature decl of the parsed Root Signature elements + /// onto the AST and push it onto current Scope + void ActOnFinishRootSignatureDecl( + SourceLocation Loc, IdentifierInfo *DeclIdent, + SmallVector &Elements); + void handleRootSignatureAttr(Decl *D, const ParsedAttr &AL); void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL); void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL); diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index cf4ae610ea51f..0f49646f3f022 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -899,6 +899,10 @@ class ASTWriter : public ASTDeserializationListener, return WritingModule && WritingModule->isNamedModule(); } + bool isWritingStdCXXHeaderUnit() const { + return WritingModule && WritingModule->isHeaderUnit(); + } + bool isGeneratingReducedBMI() const { return GeneratingReducedBMI; } bool getDoneWritingDeclsAndTypes() const { return DoneWritingDeclsAndTypes; } diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index 2a96df80d1001..211ce585fbac8 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -326,39 +326,34 @@ def StdVariantChecker : Checker<"StdVariant">, let ParentPackage = Nullability in { -def NullabilityBase : Checker<"NullabilityBase">, - HelpText<"Stores information during the analysis about nullability.">, - Documentation, - Hidden; - -def NullPassedToNonnullChecker : Checker<"NullPassedToNonnull">, - HelpText<"Warns when a null pointer is passed to a pointer which has a " - "_Nonnull type.">, - Dependencies<[NullabilityBase]>, - Documentation; + def NullPassedToNonnullChecker + : 
Checker<"NullPassedToNonnull">, + HelpText<"Warns when a null pointer is passed to a pointer which has a " + "_Nonnull type.">, + Documentation; -def NullReturnedFromNonnullChecker : Checker<"NullReturnedFromNonnull">, - HelpText<"Warns when a null pointer is returned from a function that has " - "_Nonnull return type.">, - Dependencies<[NullabilityBase]>, - Documentation; + def NullReturnedFromNonnullChecker + : Checker<"NullReturnedFromNonnull">, + HelpText<"Warns when a null pointer is returned from a function that " + "has _Nonnull return type.">, + Documentation; -def NullableDereferencedChecker : Checker<"NullableDereferenced">, - HelpText<"Warns when a nullable pointer is dereferenced.">, - Dependencies<[NullabilityBase]>, - Documentation; + def NullableDereferencedChecker + : Checker<"NullableDereferenced">, + HelpText<"Warns when a nullable pointer is dereferenced.">, + Documentation; -def NullablePassedToNonnullChecker : Checker<"NullablePassedToNonnull">, - HelpText<"Warns when a nullable pointer is passed to a pointer which has a " - "_Nonnull type.">, - Dependencies<[NullabilityBase]>, - Documentation; + def NullablePassedToNonnullChecker + : Checker<"NullablePassedToNonnull">, + HelpText<"Warns when a nullable pointer is passed to a pointer which " + "has a _Nonnull type.">, + Documentation; -def NullableReturnedFromNonnullChecker : Checker<"NullableReturnedFromNonnull">, - HelpText<"Warns when a nullable pointer is returned from a function that has " - "_Nonnull return type.">, - Dependencies<[NullabilityBase]>, - Documentation; + def NullableReturnedFromNonnullChecker + : Checker<"NullableReturnedFromNonnull">, + HelpText<"Warns when a nullable pointer is returned from a function " + "that has _Nonnull return type.">, + Documentation; } // end "nullability" diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 4d44f23c0f503..189e67e4eed0d 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -1705,6 
+1705,73 @@ void ASTContext::setRelocationInfoForCXXRecord( RelocatableClasses.insert({D, Info}); } +static bool primaryBaseHaseAddressDiscriminatedVTableAuthentication( + ASTContext &Context, const CXXRecordDecl *Class) { + if (!Class->isPolymorphic()) + return false; + const CXXRecordDecl *BaseType = Context.baseForVTableAuthentication(Class); + using AuthAttr = VTablePointerAuthenticationAttr; + const AuthAttr *ExplicitAuth = BaseType->getAttr(); + if (!ExplicitAuth) + return Context.getLangOpts().PointerAuthVTPtrAddressDiscrimination; + AuthAttr::AddressDiscriminationMode AddressDiscrimination = + ExplicitAuth->getAddressDiscrimination(); + if (AddressDiscrimination == AuthAttr::DefaultAddressDiscrimination) + return Context.getLangOpts().PointerAuthVTPtrAddressDiscrimination; + return AddressDiscrimination == AuthAttr::AddressDiscrimination; +} + +ASTContext::PointerAuthContent ASTContext::findPointerAuthContent(QualType T) { + assert(isPointerAuthenticationAvailable()); + + T = T.getCanonicalType(); + if (T.hasAddressDiscriminatedPointerAuth()) + return PointerAuthContent::AddressDiscriminatedData; + const RecordDecl *RD = T->getAsRecordDecl(); + if (!RD) + return PointerAuthContent::None; + + if (auto Existing = RecordContainsAddressDiscriminatedPointerAuth.find(RD); + Existing != RecordContainsAddressDiscriminatedPointerAuth.end()) + return Existing->second; + + PointerAuthContent Result = PointerAuthContent::None; + + auto SaveResultAndReturn = [&]() -> PointerAuthContent { + auto [ResultIter, DidAdd] = + RecordContainsAddressDiscriminatedPointerAuth.try_emplace(RD, Result); + (void)ResultIter; + (void)DidAdd; + assert(DidAdd); + return Result; + }; + auto ShouldContinueAfterUpdate = [&](PointerAuthContent NewResult) { + static_assert(PointerAuthContent::None < + PointerAuthContent::AddressDiscriminatedVTable); + static_assert(PointerAuthContent::AddressDiscriminatedVTable < + PointerAuthContent::AddressDiscriminatedData); + if (NewResult > Result) + 
Result = NewResult; + return Result != PointerAuthContent::AddressDiscriminatedData; + }; + if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) { + if (primaryBaseHaseAddressDiscriminatedVTableAuthentication(*this, CXXRD) && + !ShouldContinueAfterUpdate( + PointerAuthContent::AddressDiscriminatedVTable)) + return SaveResultAndReturn(); + for (auto Base : CXXRD->bases()) { + if (!ShouldContinueAfterUpdate(findPointerAuthContent(Base.getType()))) + return SaveResultAndReturn(); + } + } + for (auto *FieldDecl : RD->fields()) { + if (!ShouldContinueAfterUpdate( + findPointerAuthContent(FieldDecl->getType()))) + return SaveResultAndReturn(); + } + return SaveResultAndReturn(); +} + void ASTContext::addedLocalImportDecl(ImportDecl *Import) { assert(!Import->getNextLocalImport() && "Import declaration already in the chain"); diff --git a/clang/lib/AST/ByteCode/InterpBlock.cpp b/clang/lib/AST/ByteCode/InterpBlock.cpp index 9ef44cd29ff87..f60307870ffcc 100644 --- a/clang/lib/AST/ByteCode/InterpBlock.cpp +++ b/clang/lib/AST/ByteCode/InterpBlock.cpp @@ -69,20 +69,26 @@ void Block::cleanup() { void Block::replacePointer(Pointer *Old, Pointer *New) { assert(Old); assert(New); + assert(Old != New); if (IsStatic) { assert(!Pointers); return; } - #ifndef NDEBUG assert(hasPointer(Old)); #endif - removePointer(Old); - addPointer(New); + if (Old->Prev) + Old->Prev->Next = New; + if (Old->Next) + Old->Next->Prev = New; + New->Prev = Old->Prev; + New->Next = Old->Next; + if (Pointers == Old) + Pointers = New; Old->PointeeStorage.BS.Pointee = nullptr; - + New->PointeeStorage.BS.Pointee = this; #ifndef NDEBUG assert(!hasPointer(Old)); assert(hasPointer(New)); diff --git a/clang/lib/AST/ByteCode/InterpStack.cpp b/clang/lib/AST/ByteCode/InterpStack.cpp index b183335dd5884..6b748d62b83bd 100644 --- a/clang/lib/AST/ByteCode/InterpStack.cpp +++ b/clang/lib/AST/ByteCode/InterpStack.cpp @@ -19,9 +19,7 @@ using namespace clang; using namespace clang::interp; -InterpStack::~InterpStack() { clear(); } 
- -void InterpStack::clear() { +InterpStack::~InterpStack() { if (Chunk && Chunk->Next) std::free(Chunk->Next); if (Chunk) @@ -33,6 +31,21 @@ void InterpStack::clear() { #endif } +// We keep the last chunk around to reuse. +void InterpStack::clear() { + if (!Chunk) + return; + + if (Chunk->Next) + std::free(Chunk->Next); + + assert(Chunk); + StackSize = 0; +#ifndef NDEBUG + ItemTypes.clear(); +#endif +} + void InterpStack::clearTo(size_t NewSize) { assert(NewSize <= size()); size_t ToShrink = size() - NewSize; diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index c59ac78210f81..19fac00ab8736 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -20,10 +20,18 @@ #include "mlir/Support/LLVM.h" #include "clang/AST/Expr.h" #include "clang/AST/GlobalDecl.h" +#include "clang/CIR/MissingFeatures.h" #include "llvm/Support/ErrorHandling.h" using namespace clang; using namespace clang::CIRGen; +using namespace llvm; + +static RValue emitLibraryCall(CIRGenFunction &cgf, const FunctionDecl *fd, + const CallExpr *e, mlir::Operation *calleeValue) { + CIRGenCallee callee = CIRGenCallee::forDirect(calleeValue, GlobalDecl(fd)); + return cgf.emitCall(e->getCallee()->getType(), callee, e, ReturnValueSlot()); +} RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, const CallExpr *e, @@ -49,7 +57,34 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, } } - mlir::Location loc = getLoc(e->getExprLoc()); - cgm.errorNYI(loc, "non constant foldable builtin calls"); + const FunctionDecl *fd = gd.getDecl()->getAsFunction(); + + // If this is an alias for a lib function (e.g. __builtin_sin), emit + // the call using the normal call path, but using the unmangled + // version of the function name. 
+ if (getContext().BuiltinInfo.isLibFunction(builtinID)) + return emitLibraryCall(*this, fd, e, + cgm.getBuiltinLibFunction(fd, builtinID)); + + cgm.errorNYI(e->getSourceRange(), "unimplemented builtin call"); return getUndefRValue(e->getType()); } + +/// Given a builtin id for a function like "__builtin_fabsf", return a Function* +/// for "fabsf". +cir::FuncOp CIRGenModule::getBuiltinLibFunction(const FunctionDecl *fd, + unsigned builtinID) { + assert(astContext.BuiltinInfo.isLibFunction(builtinID)); + + // Get the name, skip over the __builtin_ prefix (if necessary). We may have + // to build this up so provide a small stack buffer to handle the vast + // majority of names. + llvm::SmallString<64> name; + + assert(!cir::MissingFeatures::asmLabelAttr()); + name = astContext.BuiltinInfo.getName(builtinID).substr(10); + + GlobalDecl d(fd); + mlir::Type type = convertType(fd->getType()); + return getOrCreateCIRFunction(name, type, d, /*forVTable=*/false); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp index 0d9064425fa95..af0e6ca822b8f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp @@ -443,7 +443,7 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo, mlir::Value v; if (arg.isAggregate()) cgm.errorNYI(loc, "emitCall: aggregate call argument"); - v = arg.getKnownRValue().getScalarVal(); + v = arg.getKnownRValue().getValue(); // We might have to widen integers, but we should never truncate. 
if (argType != v.getType() && mlir::isa(v.getType())) diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 2e43f10be132c..4f2046ad26d72 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -219,7 +219,7 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst, const mlir::Value vector = builder.createLoad(loc, dst.getVectorAddress()); const mlir::Value newVector = builder.create( - loc, vector, src.getScalarVal(), dst.getVectorIdx()); + loc, vector, src.getValue(), dst.getVectorIdx()); builder.createStore(loc, newVector, dst.getVectorAddress()); return; } @@ -232,7 +232,7 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst, assert(!cir::MissingFeatures::opLoadStoreObjC()); assert(src.isScalar() && "Can't emit an aggregate store with this method"); - emitStoreOfScalar(src.getScalarVal(), dst, isInit); + emitStoreOfScalar(src.getValue(), dst, isInit); } static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e, @@ -949,7 +949,7 @@ LValue CIRGenFunction::emitCallExprLValue(const CallExpr *e) { "Can't have a scalar return unless the return type is a " "reference type!"); - return makeNaturalAlignPointeeAddrLValue(rv.getScalarVal(), e->getType()); + return makeNaturalAlignPointeeAddrLValue(rv.getValue(), e->getType()); } LValue CIRGenFunction::emitBinaryOperatorLValue(const BinaryOperator *e) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 2ffe75a388e98..26070a6ca307a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -21,6 +21,8 @@ class ComplexExprEmitter : public StmtVisitor { bool isInit); mlir::Value VisitInitListExpr(InitListExpr *e); + + mlir::Value VisitImaginaryLiteral(const ImaginaryLiteral *il); }; } // namespace @@ -66,6 +68,34 @@ mlir::Value ComplexExprEmitter::VisitInitListExpr(InitListExpr *e) { return 
builder.create(loc, complexAttr); } +mlir::Value +ComplexExprEmitter::VisitImaginaryLiteral(const ImaginaryLiteral *il) { + auto ty = mlir::cast(cgf.convertType(il->getType())); + mlir::Type elementTy = ty.getElementType(); + mlir::Location loc = cgf.getLoc(il->getExprLoc()); + + mlir::TypedAttr realValueAttr; + mlir::TypedAttr imagValueAttr; + + if (mlir::isa(elementTy)) { + llvm::APInt imagValue = cast(il->getSubExpr())->getValue(); + realValueAttr = cir::IntAttr::get(elementTy, 0); + imagValueAttr = cir::IntAttr::get(elementTy, imagValue); + } else { + assert(mlir::isa(elementTy) && + "Expected complex element type to be floating-point"); + + llvm::APFloat imagValue = + cast(il->getSubExpr())->getValue(); + realValueAttr = cir::FPAttr::get( + elementTy, llvm::APFloat::getZero(imagValue.getSemantics())); + imagValueAttr = cir::FPAttr::get(elementTy, imagValue); + } + + auto complexAttr = cir::ConstComplexAttr::get(realValueAttr, imagValueAttr); + return builder.create(loc, complexAttr); +} + mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) { assert(e && getComplexType(e->getType()) && "Invalid complex expression to emit"); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index c41ab54be09ca..8b817f3f3d8d2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -254,8 +254,8 @@ class ConstExprEmitter } mlir::Attribute VisitStringLiteral(StringLiteral *e, QualType t) { - cgm.errorNYI(e->getBeginLoc(), "ConstExprEmitter::VisitStringLiteral"); - return {}; + // This is a string literal initializing an array in an initializer. 
+ return cgm.getConstantArrayFromStringLiteral(e); } mlir::Attribute VisitObjCEncodeExpr(ObjCEncodeExpr *e, QualType t) { @@ -329,6 +329,222 @@ emitArrayConstant(CIRGenModule &cgm, mlir::Type desiredType, return {}; } +//===----------------------------------------------------------------------===// +// ConstantLValueEmitter +//===----------------------------------------------------------------------===// + +namespace { +/// A struct which can be used to peephole certain kinds of finalization +/// that normally happen during l-value emission. +struct ConstantLValue { + llvm::PointerUnion value; + bool hasOffsetApplied; + + ConstantLValue(std::nullptr_t) : value(nullptr), hasOffsetApplied(false) {} + ConstantLValue() : value(nullptr), hasOffsetApplied(false) {} +}; + +/// A helper class for emitting constant l-values. +class ConstantLValueEmitter + : public ConstStmtVisitor { + CIRGenModule &cgm; + ConstantEmitter &emitter; + const APValue &value; + QualType destType; + + // Befriend StmtVisitorBase so that we don't have to expose Visit*. 
+ friend StmtVisitorBase; + +public: + ConstantLValueEmitter(ConstantEmitter &emitter, const APValue &value, + QualType destType) + : cgm(emitter.cgm), emitter(emitter), value(value), destType(destType) {} + + mlir::Attribute tryEmit(); + +private: + mlir::Attribute tryEmitAbsolute(mlir::Type destTy); + ConstantLValue tryEmitBase(const APValue::LValueBase &base); + + ConstantLValue VisitStmt(const Stmt *s) { return nullptr; } + ConstantLValue VisitConstantExpr(const ConstantExpr *e); + ConstantLValue VisitCompoundLiteralExpr(const CompoundLiteralExpr *e); + ConstantLValue VisitStringLiteral(const StringLiteral *e); + ConstantLValue VisitObjCBoxedExpr(const ObjCBoxedExpr *e); + ConstantLValue VisitObjCEncodeExpr(const ObjCEncodeExpr *e); + ConstantLValue VisitObjCStringLiteral(const ObjCStringLiteral *e); + ConstantLValue VisitPredefinedExpr(const PredefinedExpr *e); + ConstantLValue VisitAddrLabelExpr(const AddrLabelExpr *e); + ConstantLValue VisitCallExpr(const CallExpr *e); + ConstantLValue VisitBlockExpr(const BlockExpr *e); + ConstantLValue VisitCXXTypeidExpr(const CXXTypeidExpr *e); + ConstantLValue + VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *e); +}; + +} // namespace + +mlir::Attribute ConstantLValueEmitter::tryEmit() { + const APValue::LValueBase &base = value.getLValueBase(); + + // The destination type should be a pointer or reference + // type, but it might also be a cast thereof. + // + // FIXME: the chain of casts required should be reflected in the APValue. + // We need this in order to correctly handle things like a ptrtoint of a + // non-zero null pointer and addrspace casts that aren't trivially + // represented in LLVM IR. + mlir::Type destTy = cgm.getTypes().convertTypeForMem(destType); + assert(mlir::isa(destTy)); + + // If there's no base at all, this is a null or absolute pointer, + // possibly cast back to an integer type. + if (!base) + return tryEmitAbsolute(destTy); + + // Otherwise, try to emit the base. 
+ ConstantLValue result = tryEmitBase(base); + + // If that failed, we're done. + llvm::PointerUnion &value = result.value; + if (!value) + return {}; + + // Apply the offset if necessary and not already done. + if (!result.hasOffsetApplied) { + cgm.errorNYI("ConstantLValueEmitter: apply offset"); + return {}; + } + + // Convert to the appropriate type; this could be an lvalue for + // an integer. FIXME: performAddrSpaceCast + if (mlir::isa(destTy)) { + if (auto attr = mlir::dyn_cast(value)) + return attr; + cgm.errorNYI("ConstantLValueEmitter: non-attribute pointer"); + return {}; + } + + cgm.errorNYI("ConstantLValueEmitter: other?"); + return {}; +} + +/// Try to emit an absolute l-value, such as a null pointer or an integer +/// bitcast to pointer type. +mlir::Attribute ConstantLValueEmitter::tryEmitAbsolute(mlir::Type destTy) { + // If we're producing a pointer, this is easy. + auto destPtrTy = mlir::cast(destTy); + return cgm.getBuilder().getConstPtrAttr( + destPtrTy, value.getLValueOffset().getQuantity()); +} + +ConstantLValue +ConstantLValueEmitter::tryEmitBase(const APValue::LValueBase &base) { + // Handle values. + if (const ValueDecl *d = base.dyn_cast()) { + // The constant always points to the canonical declaration. We want to look + // at properties of the most recent declaration at the point of emission. + d = cast(d->getMostRecentDecl()); + + if (d->hasAttr()) { + cgm.errorNYI(d->getSourceRange(), + "ConstantLValueEmitter: emit pointer base for weakref"); + return {}; + } + + if (auto *fd = dyn_cast(d)) { + cgm.errorNYI(fd->getSourceRange(), + "ConstantLValueEmitter: function decl"); + return {}; + } + + if (auto *vd = dyn_cast(d)) { + cgm.errorNYI(vd->getSourceRange(), "ConstantLValueEmitter: var decl"); + return {}; + } + } + + // Handle typeid(T). + if (base.dyn_cast()) { + cgm.errorNYI("ConstantLValueEmitter: typeid"); + return {}; + } + + // Otherwise, it must be an expression. 
+ return Visit(base.get()); +} + +ConstantLValue ConstantLValueEmitter::VisitConstantExpr(const ConstantExpr *e) { + cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: constant expr"); + return {}; +} + +ConstantLValue +ConstantLValueEmitter::VisitCompoundLiteralExpr(const CompoundLiteralExpr *e) { + cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: compound literal"); + return {}; +} + +ConstantLValue +ConstantLValueEmitter::VisitStringLiteral(const StringLiteral *e) { + cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: string literal"); + return {}; +} + +ConstantLValue +ConstantLValueEmitter::VisitObjCEncodeExpr(const ObjCEncodeExpr *e) { + cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: objc encode expr"); + return {}; +} + +ConstantLValue +ConstantLValueEmitter::VisitObjCStringLiteral(const ObjCStringLiteral *e) { + cgm.errorNYI(e->getSourceRange(), + "ConstantLValueEmitter: objc string literal"); + return {}; +} + +ConstantLValue +ConstantLValueEmitter::VisitObjCBoxedExpr(const ObjCBoxedExpr *e) { + cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: objc boxed expr"); + return {}; +} + +ConstantLValue +ConstantLValueEmitter::VisitPredefinedExpr(const PredefinedExpr *e) { + cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: predefined expr"); + return {}; +} + +ConstantLValue +ConstantLValueEmitter::VisitAddrLabelExpr(const AddrLabelExpr *e) { + cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: addr label expr"); + return {}; +} + +ConstantLValue ConstantLValueEmitter::VisitCallExpr(const CallExpr *e) { + cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: call expr"); + return {}; +} + +ConstantLValue ConstantLValueEmitter::VisitBlockExpr(const BlockExpr *e) { + cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: block expr"); + return {}; +} + +ConstantLValue +ConstantLValueEmitter::VisitCXXTypeidExpr(const CXXTypeidExpr *e) { + cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: cxx 
typeid expr"); + return {}; +} + +ConstantLValue ConstantLValueEmitter::VisitMaterializeTemporaryExpr( + const MaterializeTemporaryExpr *e) { + cgm.errorNYI(e->getSourceRange(), + "ConstantLValueEmitter: materialize temporary expr"); + return {}; +} + //===----------------------------------------------------------------------===// // ConstantEmitter //===----------------------------------------------------------------------===// @@ -556,23 +772,8 @@ mlir::Attribute ConstantEmitter::tryEmitPrivate(const APValue &value, cgm.errorNYI("ConstExprEmitter::tryEmitPrivate member pointer"); return {}; } - case APValue::LValue: { - - if (value.getLValueBase()) { - cgm.errorNYI("non-null pointer initialization"); - } else { - - mlir::Type desiredType = cgm.convertType(destType); - if (const cir::PointerType ptrType = - mlir::dyn_cast(desiredType)) { - return builder.getConstPtrAttr(ptrType, - value.getLValueOffset().getQuantity()); - } else { - llvm_unreachable("non-pointer variable initialized with a pointer"); - } - } - return {}; - } + case APValue::LValue: + return ConstantLValueEmitter(*this, value, destType).tryEmit(); case APValue::Struct: case APValue::Union: cgm.errorNYI("ConstExprEmitter::tryEmitPrivate struct or union"); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 75b4d2a637e6e..8d0db5cd0a1e5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -131,11 +131,11 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value emitLoadOfLValue(const Expr *e) { LValue lv = cgf.emitLValue(e); // FIXME: add some akin to EmitLValueAlignmentAssumption(E, V); - return cgf.emitLoadOfLValue(lv, e->getExprLoc()).getScalarVal(); + return cgf.emitLoadOfLValue(lv, e->getExprLoc()).getValue(); } mlir::Value emitLoadOfLValue(LValue lv, SourceLocation loc) { - return cgf.emitLoadOfLValue(lv, loc).getScalarVal(); + return cgf.emitLoadOfLValue(lv, loc).getValue(); } // 
l-values @@ -400,10 +400,10 @@ class ScalarExprEmitter : public StmtVisitor { cgf.cgm.errorNYI(e->getSourceRange(), "Atomic inc/dec"); // TODO(cir): This is not correct, but it will produce reasonable code // until atomic operations are implemented. - value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getScalarVal(); + value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getValue(); input = value; } else { - value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getScalarVal(); + value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getValue(); input = value; } @@ -1805,7 +1805,7 @@ mlir::Value ScalarExprEmitter::VisitCallExpr(const CallExpr *e) { if (e->getCallReturnType(cgf.getContext())->isReferenceType()) return emitLoadOfLValue(e); - auto v = cgf.emitCallExpr(e).getScalarVal(); + auto v = cgf.emitCallExpr(e).getValue(); assert(!cir::MissingFeatures::emitLValueAlignmentAssumption()); return v; } diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 03606dba200fd..0ea2d9f9c8229 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -301,6 +301,10 @@ class CIRGenModule : public CIRGenTypeCache { cir::FuncType funcType, const clang::FunctionDecl *funcDecl); + /// Given a builtin id for a function like "__builtin_fabsf", return a + /// Function* for "fabsf". + cir::FuncOp getBuiltinLibFunction(const FunctionDecl *fd, unsigned builtinID); + mlir::IntegerAttr getSize(CharUnits size) { return builder.getSizeFromCharUnits(size); } diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index 019a44636ce3c..9193f6f1cd996 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -391,8 +391,7 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) { // If this function returns a reference, take the address of the // expression rather than the value. 
RValue result = emitReferenceBindingToExpr(rv); - builder.CIRBaseBuilderTy::createStore(loc, result.getScalarVal(), - *fnRetAlloca); + builder.CIRBaseBuilderTy::createStore(loc, result.getValue(), *fnRetAlloca); } else { mlir::Value value = nullptr; switch (CIRGenFunction::getEvaluationKind(rv->getType())) { diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h index c1e08ba1e9b67..84972fc7f9118 100644 --- a/clang/lib/CIR/CodeGen/CIRGenValue.h +++ b/clang/lib/CIR/CodeGen/CIRGenValue.h @@ -33,11 +33,7 @@ class RValue { enum Flavor { Scalar, Complex, Aggregate }; union { - // Stores first and second value. - struct { - mlir::Value first; - mlir::Value second; - } vals; + mlir::Value value; // Stores aggregate address. Address aggregateAddr; @@ -47,7 +43,7 @@ class RValue { unsigned flavor : 2; public: - RValue() : vals{nullptr, nullptr}, flavor(Scalar) {} + RValue() : value(nullptr), flavor(Scalar) {} bool isScalar() const { return flavor == Scalar; } bool isComplex() const { return flavor == Complex; } @@ -56,14 +52,9 @@ class RValue { bool isVolatileQualified() const { return isVolatile; } /// Return the value of this scalar value. - mlir::Value getScalarVal() const { + mlir::Value getValue() const { assert(isScalar() && "Not a scalar!"); - return vals.first; - } - - /// Return the real/imag components of this complex value. - std::pair getComplexVal() const { - return std::make_pair(vals.first, vals.second); + return value; } /// Return the value of the address of the aggregate. 
@@ -83,22 +74,20 @@ class RValue { static RValue get(mlir::Value v) { RValue er; - er.vals.first = v; + er.value = v; er.flavor = Scalar; er.isVolatile = false; return er; } - static RValue getComplex(mlir::Value v1, mlir::Value v2) { + static RValue getComplex(mlir::Value v) { RValue er; - er.vals = {v1, v2}; + er.value = v; er.flavor = Complex; er.isVolatile = false; return er; } - static RValue getComplex(const std::pair &c) { - return getComplex(c.first, c.second); - } + // FIXME: Aggregate rvalues need to retain information about whether they are // volatile or not. Remove default to find all places that probably get this // wrong. diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 5578d4f5825a9..3fcb0213b219a 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -1589,6 +1589,104 @@ OpFoldResult cir::VecExtractOp::fold(FoldAdaptor adaptor) { return elements[index]; } +//===----------------------------------------------------------------------===// +// VecCmpOp +//===----------------------------------------------------------------------===// + +OpFoldResult cir::VecCmpOp::fold(FoldAdaptor adaptor) { + auto lhsVecAttr = + mlir::dyn_cast_if_present(adaptor.getLhs()); + auto rhsVecAttr = + mlir::dyn_cast_if_present(adaptor.getRhs()); + if (!lhsVecAttr || !rhsVecAttr) + return {}; + + mlir::Type inputElemTy = + mlir::cast(lhsVecAttr.getType()).getElementType(); + if (!isAnyIntegerOrFloatingPointType(inputElemTy)) + return {}; + + cir::CmpOpKind opKind = adaptor.getKind(); + mlir::ArrayAttr lhsVecElhs = lhsVecAttr.getElts(); + mlir::ArrayAttr rhsVecElhs = rhsVecAttr.getElts(); + uint64_t vecSize = lhsVecElhs.size(); + + SmallVector elements(vecSize); + bool isIntAttr = vecSize && mlir::isa(lhsVecElhs[0]); + for (uint64_t i = 0; i < vecSize; i++) { + mlir::Attribute lhsAttr = lhsVecElhs[i]; + mlir::Attribute rhsAttr = rhsVecElhs[i]; + int cmpResult = 0; + switch 
(opKind) { + case cir::CmpOpKind::lt: { + if (isIntAttr) { + cmpResult = mlir::cast(lhsAttr).getSInt() < + mlir::cast(rhsAttr).getSInt(); + } else { + cmpResult = mlir::cast(lhsAttr).getValue() < + mlir::cast(rhsAttr).getValue(); + } + break; + } + case cir::CmpOpKind::le: { + if (isIntAttr) { + cmpResult = mlir::cast(lhsAttr).getSInt() <= + mlir::cast(rhsAttr).getSInt(); + } else { + cmpResult = mlir::cast(lhsAttr).getValue() <= + mlir::cast(rhsAttr).getValue(); + } + break; + } + case cir::CmpOpKind::gt: { + if (isIntAttr) { + cmpResult = mlir::cast(lhsAttr).getSInt() > + mlir::cast(rhsAttr).getSInt(); + } else { + cmpResult = mlir::cast(lhsAttr).getValue() > + mlir::cast(rhsAttr).getValue(); + } + break; + } + case cir::CmpOpKind::ge: { + if (isIntAttr) { + cmpResult = mlir::cast(lhsAttr).getSInt() >= + mlir::cast(rhsAttr).getSInt(); + } else { + cmpResult = mlir::cast(lhsAttr).getValue() >= + mlir::cast(rhsAttr).getValue(); + } + break; + } + case cir::CmpOpKind::eq: { + if (isIntAttr) { + cmpResult = mlir::cast(lhsAttr).getSInt() == + mlir::cast(rhsAttr).getSInt(); + } else { + cmpResult = mlir::cast(lhsAttr).getValue() == + mlir::cast(rhsAttr).getValue(); + } + break; + } + case cir::CmpOpKind::ne: { + if (isIntAttr) { + cmpResult = mlir::cast(lhsAttr).getSInt() != + mlir::cast(rhsAttr).getSInt(); + } else { + cmpResult = mlir::cast(lhsAttr).getValue() != + mlir::cast(rhsAttr).getValue(); + } + break; + } + } + + elements[i] = cir::IntAttr::get(getType().getElementType(), cmpResult); + } + + return cir::ConstVectorAttr::get( + getType(), mlir::ArrayAttr::get(getContext(), elements)); +} + //===----------------------------------------------------------------------===// // VecShuffleOp //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp index 20c634d6c66f6..f07e234e5e84c 100644 --- 
a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp +++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp @@ -141,7 +141,7 @@ void CIRCanonicalizePass::runOnOperation() { // Many operations are here to perform a manual `fold` in // applyOpPatternsGreedily. if (isa(op)) ops.push_back(op); }); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 571ff53b7d644..585411bc59e16 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -471,14 +471,6 @@ void CGHLSLRuntime::emitEntryFunction(const FunctionDecl *FD, } } -void CGHLSLRuntime::setHLSLFunctionAttributes(const FunctionDecl *FD, - llvm::Function *Fn) { - if (FD->isInExportDeclContext()) { - const StringRef ExportAttrKindStr = "hlsl.export"; - Fn->addFnAttr(ExportAttrKindStr); - } -} - static void gatherFunctions(SmallVectorImpl &Fns, llvm::Module &M, bool CtorOrDtor) { const auto *GV = diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 13d0633e9b1c0..70a09795d02fe 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1267,7 +1267,6 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, if (FD->hasAttr()) { CGM.getHLSLRuntime().emitEntryFunction(FD, Fn); } - CGM.getHLSLRuntime().setHLSLFunctionAttributes(FD, Fn); } EmitFunctionProlog(*CurFnInfo, CurFn, Args); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index c036902b0b130..c27168e4c4bfe 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1319,8 +1319,10 @@ void CodeGenModule::Release() { 1); // Enable unwind v2 (epilog). 
- if (CodeGenOpts.WinX64EHUnwindV2) - getModule().addModuleFlag(llvm::Module::Warning, "winx64-eh-unwindv2", 1); + if (CodeGenOpts.getWinX64EHUnwindV2() != llvm::WinX64EHUnwindV2Mode::Disabled) + getModule().addModuleFlag( + llvm::Module::Warning, "winx64-eh-unwindv2", + static_cast(CodeGenOpts.getWinX64EHUnwindV2())); // Indicate whether this Module was compiled with -fopenmp if (getLangOpts().OpenMP && !getLangOpts().OpenMPSimd) @@ -1666,6 +1668,11 @@ void CodeGenModule::Release() { return; } + if (Context.getLangOpts().HLSL && !D->isInExportDeclContext()) { + GV->setVisibility(llvm::GlobalValue::HiddenVisibility); + return; + } + if (GV->hasDLLExportStorageClass() || GV->hasDLLImportStorageClass()) { // Reject incompatible dlllstorage and visibility annotations. if (!LV.isVisibilityExplicit()) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 060f76fb653c9..2f86b6633df1c 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4330,6 +4330,14 @@ void Driver::handleArguments(Compilation &C, DerivedArgList &Args, YcArg = YuArg = nullptr; } + if (Args.hasArg(options::OPT_include_pch) && + Args.hasArg(options::OPT_ignore_pch)) { + // If -ignore-pch is used, -include-pch is disabled. Since -emit-pch is + // a CC1 option, it will not be added to command arguments if -ignore-pch is + // used. + Args.eraseArg(options::OPT_include_pch); + } + bool LinkOnly = phases::Link == FinalPhase && Inputs.size() > 0; for (auto &I : Inputs) { types::ID InputType = I.first; diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index d8168ed15febd..0fbfe6c77f342 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -31,6 +31,40 @@ using namespace clang::driver; using namespace clang::driver::tools; using namespace clang::driver::toolchains; +/// Is the triple {aarch64,aarch64_be}-none-elf? 
+static bool isAArch64BareMetal(const llvm::Triple &Triple) { + if (Triple.getArch() != llvm::Triple::aarch64 && + Triple.getArch() != llvm::Triple::aarch64_be) + return false; + + if (Triple.getVendor() != llvm::Triple::UnknownVendor) + return false; + + if (Triple.getOS() != llvm::Triple::UnknownOS) + return false; + + return Triple.getEnvironmentName() == "elf"; +} + +static bool isRISCVBareMetal(const llvm::Triple &Triple) { + if (!Triple.isRISCV()) + return false; + + if (Triple.getVendor() != llvm::Triple::UnknownVendor) + return false; + + if (Triple.getOS() != llvm::Triple::UnknownOS) + return false; + + return Triple.getEnvironmentName() == "elf"; +} + +/// Is the triple powerpc[64][le]-*-none-eabi? +static bool isPPCBareMetal(const llvm::Triple &Triple) { + return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS && + Triple.getEnvironment() == llvm::Triple::EABI; +} + static bool findRISCVMultilibs(const Driver &D, const llvm::Triple &TargetTriple, const ArgList &Args, DetectedMultilibs &Result) { @@ -95,7 +129,8 @@ static bool findRISCVMultilibs(const Driver &D, return false; } -static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) { +static std::string computeClangRuntimesSysRoot(const Driver &D, + bool IncludeTriple) { if (!D.SysRoot.empty()) return D.SysRoot; @@ -108,56 +143,123 @@ static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) { return std::string(SysRootDir); } -BareMetal::BareMetal(const Driver &D, const llvm::Triple &Triple, - const ArgList &Args) - : ToolChain(D, Triple, Args), - SysRoot(computeBaseSysRoot(D, /*IncludeTriple=*/true)) { - getProgramPaths().push_back(getDriver().Dir); - - findMultilibs(D, Triple, Args); - SmallString<128> SysRoot(computeSysRoot()); - if (!SysRoot.empty()) { - for (const Multilib &M : getOrderedMultilibs()) { - SmallString<128> Dir(SysRoot); - llvm::sys::path::append(Dir, M.osSuffix(), "lib"); - getFilePaths().push_back(std::string(Dir)); - 
getLibraryPaths().push_back(std::string(Dir)); - } +// Only consider the GCC toolchain based on the values provided through the +// `--gcc-toolchain` and `--gcc-install-dir` flags. The function below returns +// whether the GCC toolchain was initialized successfully. +bool BareMetal::initGCCInstallation(const llvm::Triple &Triple, + const llvm::opt::ArgList &Args) { + if (Args.getLastArg(options::OPT_gcc_toolchain) || + Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) { + GCCInstallation.init(Triple, Args); + return GCCInstallation.isValid(); } + return false; } -/// Is the triple {aarch64.aarch64_be}-none-elf? -static bool isAArch64BareMetal(const llvm::Triple &Triple) { - if (Triple.getArch() != llvm::Triple::aarch64 && - Triple.getArch() != llvm::Triple::aarch64_be) - return false; - - if (Triple.getVendor() != llvm::Triple::UnknownVendor) - return false; - - if (Triple.getOS() != llvm::Triple::UnknownOS) - return false; - - return Triple.getEnvironmentName() == "elf"; +// This logic is adapted from RISCVToolChain.cpp as part of the ongoing effort +// to merge RISCVToolChain into the Baremetal toolchain. It infers the presence +// of a valid GCC toolchain by checking whether the `crt0.o` file exists in the +// `bin/..//lib` directory. +static bool detectGCCToolchainAdjacent(const Driver &D) { + SmallString<128> GCCDir; + llvm::sys::path::append(GCCDir, D.Dir, "..", D.getTargetTriple(), + "lib/crt0.o"); + return llvm::sys::fs::exists(GCCDir); } -static bool isRISCVBareMetal(const llvm::Triple &Triple) { - if (!Triple.isRISCV()) - return false; +// If no sysroot is provided the driver will first attempt to infer it from the +// values of `--gcc-install-dir` or `--gcc-toolchain`, which specify the +// location of a GCC toolchain. 
+// If neither flag is used, the sysroot defaults to either: +//    - `bin/../` +//    - `bin/../lib/clang-runtimes/` +// +// To use the `clang-runtimes` path, ensure that `..//lib/crt0.o` +// does not exist relative to the driver. +std::string BareMetal::computeSysRoot() const { + // Use Baremetal::sysroot if it has already been set. + if (!SysRoot.empty()) + return SysRoot; + + // Use the sysroot specified via the `--sysroot` command-line flag, if + // provided. + const Driver &D = getDriver(); + if (!D.SysRoot.empty()) + return D.SysRoot; - if (Triple.getVendor() != llvm::Triple::UnknownVendor) - return false; + // Attempt to infer sysroot from a valid GCC installation. + // If no valid GCC installation, check for a GCC toolchain alongside Clang. + SmallString<128> inferredSysRoot; + if (IsGCCInstallationValid) { + llvm::sys::path::append(inferredSysRoot, GCCInstallation.getParentLibPath(), + "..", GCCInstallation.getTriple().str()); + } else if (detectGCCToolchainAdjacent(D)) { + // Use the triple as provided to the driver. Unlike the parsed triple + // this has not been normalized to always contain every field. + llvm::sys::path::append(inferredSysRoot, D.Dir, "..", D.getTargetTriple()); + } + // If a valid sysroot was inferred and exists, use it + if (!inferredSysRoot.empty() && llvm::sys::fs::exists(inferredSysRoot)) + return std::string(inferredSysRoot); - if (Triple.getOS() != llvm::Triple::UnknownOS) - return false; + // Use the clang-runtimes path. + return computeClangRuntimesSysRoot(D, /*IncludeTriple*/ true); +} - return Triple.getEnvironmentName() == "elf"; +static void addMultilibsFilePaths(const Driver &D, const MultilibSet &Multilibs, + const Multilib &Multilib, + StringRef InstallPath, + ToolChain::path_list &Paths) { + if (const auto &PathsCallback = Multilibs.filePathsCallback()) + for (const auto &Path : PathsCallback(Multilib)) + addPathIfExists(D, InstallPath + Path, Paths); } -/// Is the triple powerpc[64][le]-*-none-eabi? 
-static bool isPPCBareMetal(const llvm::Triple &Triple) { - return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS && - Triple.getEnvironment() == llvm::Triple::EABI; +// GCC multilibs will only work for those targets that have their multilib +// structure encoded into GCCInstallation. Baremetal toolchain supports ARM, +// AArch64, RISCV and PPC and of these only RISCV has GCC multilibs hardcoded +// in GCCInstallation. +BareMetal::BareMetal(const Driver &D, const llvm::Triple &Triple, + const ArgList &Args) + : Generic_ELF(D, Triple, Args) { + IsGCCInstallationValid = initGCCInstallation(Triple, Args); + std::string ComputedSysRoot = computeSysRoot(); + if (IsGCCInstallationValid) { + if (!isRISCVBareMetal(Triple)) + D.Diag(clang::diag::warn_drv_multilib_not_available_for_target); + + Multilibs = GCCInstallation.getMultilibs(); + SelectedMultilibs.assign({GCCInstallation.getMultilib()}); + + path_list &Paths = getFilePaths(); + // Add toolchain/multilib specific file paths. + addMultilibsFilePaths(D, Multilibs, SelectedMultilibs.back(), + GCCInstallation.getInstallPath(), Paths); + // Adding filepath for locating crt{begin,end}.o files. + Paths.push_back(GCCInstallation.getInstallPath().str()); + // Adding filepath for locating crt0.o file. + Paths.push_back(ComputedSysRoot + "/lib"); + + ToolChain::path_list &PPaths = getProgramPaths(); + // Multilib cross-compiler GCC installations put ld in a triple-prefixed + // directory off of the parent of the GCC installation. 
+ PPaths.push_back(Twine(GCCInstallation.getParentLibPath() + "/../" + + GCCInstallation.getTriple().str() + "/bin") + .str()); + PPaths.push_back((GCCInstallation.getParentLibPath() + "/../bin").str()); + } else { + getProgramPaths().push_back(getDriver().Dir); + findMultilibs(D, Triple, Args); + const SmallString<128> SysRootDir(computeSysRoot()); + if (!SysRootDir.empty()) { + for (const Multilib &M : getOrderedMultilibs()) { + SmallString<128> Dir(SysRootDir); + llvm::sys::path::append(Dir, M.osSuffix(), "lib"); + getFilePaths().push_back(std::string(Dir)); + getLibraryPaths().push_back(std::string(Dir)); + } + } + } } static void @@ -216,7 +318,7 @@ getMultilibConfigPath(const Driver &D, const llvm::Triple &Triple, return {}; } } else { - MultilibPath = computeBaseSysRoot(D, /*IncludeTriple=*/false); + MultilibPath = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false); llvm::sys::path::append(MultilibPath, MultilibFilename); } return MultilibPath; @@ -234,7 +336,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple, if (D.getVFS().exists(*MultilibPath)) { // If multilib.yaml is found, update sysroot so it doesn't use a target // specific suffix - SysRoot = computeBaseSysRoot(D, /*IncludeTriple=*/false); + SysRoot = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false); SmallVector CustomFlagMacroDefines; findMultilibsFromYAML(*this, D, *MultilibPath, Args, Result, CustomFlagMacroDefines); @@ -242,7 +344,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple, Multilibs = Result.Multilibs; MultilibMacroDefines.append(CustomFlagMacroDefines.begin(), CustomFlagMacroDefines.end()); - } else if (isRISCVBareMetal(Triple)) { + } else if (isRISCVBareMetal(Triple) && !detectGCCToolchainAdjacent(D)) { if (findRISCVMultilibs(D, Triple, Args, Result)) { SelectedMultilibs = Result.SelectedMultilibs; Multilibs = Result.Multilibs; @@ -263,8 +365,6 @@ Tool *BareMetal::buildStaticLibTool() const { return new 
tools::baremetal::StaticLibTool(*this); } -std::string BareMetal::computeSysRoot() const { return SysRoot; } - BareMetal::OrderedMultilibs BareMetal::getOrderedMultilibs() const { // Get multilibs in reverse order because they're ordered most-specific last. if (!SelectedMultilibs.empty()) @@ -292,10 +392,10 @@ void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs, if (std::optional Path = getStdlibIncludePath()) addSystemInclude(DriverArgs, CC1Args, *Path); - const SmallString<128> SysRoot(computeSysRoot()); - if (!SysRoot.empty()) { + const SmallString<128> SysRootDir(computeSysRoot()); + if (!SysRootDir.empty()) { for (const Multilib &M : getOrderedMultilibs()) { - SmallString<128> Dir(SysRoot); + SmallString<128> Dir(SysRootDir); llvm::sys::path::append(Dir, M.includeSuffix()); llvm::sys::path::append(Dir, "include"); addSystemInclude(DriverArgs, CC1Args, Dir.str()); @@ -309,6 +409,19 @@ void BareMetal::addClangTargetOptions(const ArgList &DriverArgs, CC1Args.push_back("-nostdsysteminc"); } +void BareMetal::addLibStdCxxIncludePaths( + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const { + if (!IsGCCInstallationValid) + return; + const GCCVersion &Version = GCCInstallation.getVersion(); + StringRef TripleStr = GCCInstallation.getTriple().str(); + const Multilib &Multilib = GCCInstallation.getMultilib(); + addLibStdCXXIncludePaths(computeSysRoot() + "/include/c++/" + Version.Text, + TripleStr, Multilib.includeSuffix(), DriverArgs, + CC1Args); +} + void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { if (DriverArgs.hasArg(options::OPT_nostdinc, options::OPT_nostdlibinc, @@ -339,23 +452,23 @@ void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs, }; switch (GetCXXStdlibType(DriverArgs)) { - case ToolChain::CST_Libcxx: { - SmallString<128> P(D.Dir); - llvm::sys::path::append(P, "..", "include"); - AddCXXIncludePath(P); - break; - } - case 
ToolChain::CST_Libstdcxx: - // We only support libc++ toolchain installation. - break; + case ToolChain::CST_Libcxx: { + SmallString<128> P(D.Dir); + llvm::sys::path::append(P, "..", "include"); + AddCXXIncludePath(P); + break; + } + case ToolChain::CST_Libstdcxx: + addLibStdCxxIncludePaths(DriverArgs, CC1Args); + break; } - std::string SysRoot(computeSysRoot()); - if (SysRoot.empty()) + std::string SysRootDir(computeSysRoot()); + if (SysRootDir.empty()) return; for (const Multilib &M : getOrderedMultilibs()) { - SmallString<128> Dir(SysRoot); + SmallString<128> Dir(SysRootDir); llvm::sys::path::append(Dir, M.gccSuffix()); switch (GetCXXStdlibType(DriverArgs)) { case ToolChain::CST_Libcxx: { diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h index f6295bda0a6a2..930f8584e6435 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.h +++ b/clang/lib/Driver/ToolChains/BareMetal.h @@ -9,6 +9,7 @@ #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H +#include "ToolChains/Gnu.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" @@ -19,7 +20,7 @@ namespace driver { namespace toolchains { -class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain { +class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF { public: BareMetal(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args); @@ -35,7 +36,8 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain { Tool *buildStaticLibTool() const override; public: - bool useIntegratedAs() const override { return true; } + bool initGCCInstallation(const llvm::Triple &Triple, + const llvm::opt::ArgList &Args); bool isBareMetal() const override { return true; } bool isCrossCompiling() const override { return true; } bool HasNativeLLVMSupport() const override { return true; } @@ -48,9 +50,15 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain { StringRef getOSLibName() const override { 
return "baremetal"; } + UnwindTableLevel + getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override { + return UnwindTableLevel::None; + } + RuntimeLibType GetDefaultRuntimeLibType() const override { return ToolChain::RLT_CompilerRT; } + CXXStdlibType GetDefaultCXXStdlibType() const override { return ToolChain::CST_Libcxx; } @@ -67,6 +75,9 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain { void AddClangCXXStdlibIncludeArgs( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; + void + addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; std::string computeSysRoot() const override; SanitizerMask getSupportedSanitizers() const override; @@ -80,6 +91,8 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain { std::string SysRoot; + bool IsGCCInstallationValid; + SmallVector MultilibMacroDefines; }; @@ -104,7 +117,7 @@ class LLVM_LIBRARY_VISIBILITY StaticLibTool : public Tool { class LLVM_LIBRARY_VISIBILITY Linker final : public Tool { public: - Linker(const ToolChain &TC) : Tool("baremetal::Linker", "ld.lld", TC) {} + Linker(const ToolChain &TC) : Tool("baremetal::Linker", "linker", TC) {} bool isLinkJob() const override { return true; } bool hasIntegratedCPP() const override { return false; } void ConstructJob(Compilation &C, const JobAction &JA, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 8556bcadf0915..a78a1c8978183 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5202,7 +5202,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-emit-module-interface"); else if (JA.getType() == types::TY_HeaderUnit) CmdArgs.push_back("-emit-header-unit"); - else + else if (!Args.hasArg(options::OPT_ignore_pch)) CmdArgs.push_back("-emit-pch"); } else if (isa(JA)) { CmdArgs.push_back("-verify-pch"); @@ -5259,7 
+5259,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } else if (JA.getType() == types::TY_PP_Asm) { CmdArgs.push_back("-S"); } else if (JA.getType() == types::TY_AST) { - CmdArgs.push_back("-emit-pch"); + if (!Args.hasArg(options::OPT_ignore_pch)) + CmdArgs.push_back("-emit-pch"); } else if (JA.getType() == types::TY_ModuleFile) { CmdArgs.push_back("-module-file-info"); } else if (JA.getType() == types::TY_RewrittenObjC) { @@ -5701,11 +5702,18 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Triple.getArch() != llvm::Triple::x86_64) D.Diag(diag::err_drv_unsupported_opt_for_target) << Name << Triple.getArchName(); - } else if (Name == "libmvec" || Name == "AMDLIBM") { + } else if (Name == "AMDLIBM") { if (Triple.getArch() != llvm::Triple::x86 && Triple.getArch() != llvm::Triple::x86_64) D.Diag(diag::err_drv_unsupported_opt_for_target) << Name << Triple.getArchName(); + } else if (Name == "libmvec") { + if (Triple.getArch() != llvm::Triple::x86 && + Triple.getArch() != llvm::Triple::x86_64 && + Triple.getArch() != llvm::Triple::aarch64 && + Triple.getArch() != llvm::Triple::aarch64_be) + D.Diag(diag::err_drv_unsupported_opt_for_target) + << Name << Triple.getArchName(); } else if (Name == "SLEEF" || Name == "ArmPL") { if (Triple.getArch() != llvm::Triple::aarch64 && Triple.getArch() != llvm::Triple::aarch64_be && @@ -7360,8 +7368,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } // Unwind v2 (epilog) information for x64 Windows. - Args.addOptInFlag(CmdArgs, options::OPT_fwinx64_eh_unwindv2, - options::OPT_fno_winx64_eh_unwindv2); + Args.AddLastArg(CmdArgs, options::OPT_winx64_eh_unwindv2); // C++ "sane" operator new. Args.addOptOutFlag(CmdArgs, options::OPT_fassume_sane_operator_new, @@ -8418,8 +8425,10 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType, CmdArgs.push_back("-fms-kernel"); // Unwind v2 (epilog) information for x64 Windows. 
- if (Args.hasArg(options::OPT__SLASH_d2epilogunwind)) - CmdArgs.push_back("-fwinx64-eh-unwindv2"); + if (Args.hasArg(options::OPT__SLASH_d2epilogunwindrequirev2)) + CmdArgs.push_back("-fwinx64-eh-unwindv2=required"); + else if (Args.hasArg(options::OPT__SLASH_d2epilogunwind)) + CmdArgs.push_back("-fwinx64-eh-unwindv2=best-effort"); for (const Arg *A : Args.filtered(options::OPT__SLASH_guard)) { StringRef GuardArgs = A->getValue(); diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 1c165bbfe84f5..146dc8bbd5313 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -91,7 +91,9 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("--execute-only"); std::string CPU = getCPUName(D, Args, Triple); - if (CPU.empty() || CPU == "generic" || CPU == "cortex-a53") + if (Args.hasFlag(options::OPT_mfix_cortex_a53_843419, + options::OPT_mno_fix_cortex_a53_843419, true) && + (CPU.empty() || CPU == "generic" || CPU == "cortex-a53")) CmdArgs.push_back("--fix-cortex-a53-843419"); } diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 9c68c5c6de2b2..9203bbc91b0bb 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -402,7 +402,9 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, // Most Android ARM64 targets should enable the linker fix for erratum // 843419. Only non-Cortex-A53 devices are allowed to skip this flag. 
- if (Arch == llvm::Triple::aarch64 && (isAndroid || isOHOSFamily)) { + if (Arch == llvm::Triple::aarch64 && (isAndroid || isOHOSFamily) && + Args.hasFlag(options::OPT_mfix_cortex_a53_843419, + options::OPT_mno_fix_cortex_a53_843419, true)) { std::string CPU = getCPUName(D, Args, Triple); if (CPU.empty() || CPU == "generic" || CPU == "cortex-a53") CmdArgs.push_back("--fix-cortex-a53-843419"); diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index c1c9d2e8c7b79..c96d209c1fc0c 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -341,6 +341,7 @@ set(cuda_wrapper_files ) set(cuda_wrapper_bits_files + cuda_wrappers/bits/c++config.h cuda_wrappers/bits/shared_ptr_base.h cuda_wrappers/bits/basic_string.h cuda_wrappers/bits/basic_string.tcc diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h index 8b230af6f6647..5e13f3f78df70 100644 --- a/clang/lib/Headers/__clang_cuda_intrinsics.h +++ b/clang/lib/Headers/__clang_cuda_intrinsics.h @@ -479,6 +479,290 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32, return ret; } +#pragma push_macro("__INTRINSIC_LOAD") +#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \ + __Clobber) \ + inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \ + __TmpType __ret; \ + asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber); \ + return (__DeclType)__ret; \ + } + +#pragma push_macro("__INTRINSIC_LOAD2") +#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \ + __Clobber) \ + inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \ + __DeclType __ret; \ + __TmpType __tmp; \ + asm(__AsmOp " {%0,%1}, [%2];" \ + : __AsmType(__tmp.x), __AsmType(__tmp.y) \ + : "l"(__ptr)__Clobber); \ + using __ElementType = decltype(__ret.x); \ + __ret.x = (__ElementType)(__tmp.x); \ + __ret.y = (__ElementType)__tmp.y; \ + return __ret; \ 
+ } + +#pragma push_macro("__INTRINSIC_LOAD4") +#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \ + __Clobber) \ + inline __device__ __DeclType __FnName(const __DeclType *__ptr) { \ + __DeclType __ret; \ + __TmpType __tmp; \ + asm(__AsmOp " {%0,%1,%2,%3}, [%4];" \ + : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z), \ + __AsmType(__tmp.w) \ + : "l"(__ptr)__Clobber); \ + using __ElementType = decltype(__ret.x); \ + __ret.x = (__ElementType)__tmp.x; \ + __ret.y = (__ElementType)__tmp.y; \ + __ret.z = (__ElementType)__tmp.z; \ + __ret.w = (__ElementType)__tmp.w; \ + return __ret; \ + } + +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", ); +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", ); +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", ); +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", ); +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long, + "=l", ); + +__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", ); +__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", ); +__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", ); +__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", ); +__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", ); +__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", ); +__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2, "=l", ); + +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int, + "=r", ); +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short, + "=h", ); +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int, + "=r", ); +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long, + unsigned long long, "=l", ); + +__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", ); 
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", ); +__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", ); +__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", ); +__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", ); +__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", ); +__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2, + "=l", ); + +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", ); +__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", ); +__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", ); +__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", ); +__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", ); + +inline __device__ long __ldcg(const long *__ptr) { + unsigned long __ret; + if (sizeof(long) == 8) { + asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr)); + } else { + asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr)); + } + return (long)__ret; +} + +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int, + "=r", : "memory"); +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short, + "=h", : "memory"); +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int, + "=r", : "memory"); +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long, + unsigned long long, "=l", : "memory"); + +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int, + "=r", : "memory"); +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int, + "=r", : "memory"); +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short, + "=h", : "memory"); +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int, + "=r", : "memory"); +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long, + "=l", : "memory"); + +__INTRINSIC_LOAD2(__ldcv, 
"ld.global.cv.v2.u8", uchar2, uint2, + "=r", : "memory"); +__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4, + "=r", : "memory"); +__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2, + "=h", : "memory"); +__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4, + "=h", : "memory"); +__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2, + "=r", : "memory"); +__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4, + "=r", : "memory"); +__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2, + "=l", : "memory"); + +__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory"); +__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory"); +__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2, + "=h", : "memory"); +__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4, + "=h", : "memory"); +__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory"); +__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory"); +__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2, + "=l", : "memory"); + +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory"); +__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory"); + +__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2, + "=f", : "memory"); +__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4, + "=f", : "memory"); +__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2, + "=d", : "memory"); + +inline __device__ long __ldcv(const long *__ptr) { + unsigned long __ret; + if (sizeof(long) == 8) { + asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr)); + } else { + asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr)); + } + return (long)__ret; +} + +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", ); +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", 
signed char, signed int, "=r", ); +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", ); +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", ); +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long, + "=l", ); + +__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", ); +__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", ); +__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", ); +__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", ); +__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", ); +__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", ); +__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", ); + +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int, + "=r", ); +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short, + "=h", ); +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int, + "=r", ); +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long, + unsigned long long, "=l", ); + +__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", ); +__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", ); +__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", ); +__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", ); +__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", ); +__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", ); +__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2, + "=l", ); + +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", ); +__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", ); +__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", ); +__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", ); 
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", ); + +#pragma pop_macro("__INTRINSIC_LOAD") +#pragma pop_macro("__INTRINSIC_LOAD2") +#pragma pop_macro("__INTRINSIC_LOAD4") + +inline __device__ long __ldcs(const long *__ptr) { + unsigned long __ret; + if (sizeof(long) == 8) { + asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr)); + } else { + asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr)); + } + return (long)__ret; +} + +#pragma push_macro("__INTRINSIC_STORE") +#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \ + inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \ + __TmpType __tmp = (__TmpType)__value; \ + asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory"); \ + } + +#pragma push_macro("__INTRINSIC_STORE2") +#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType, \ + __AsmType) \ + inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \ + __TmpType __tmp; \ + using __ElementType = decltype(__tmp.x); \ + __tmp.x = (__ElementType)(__value.x); \ + __tmp.y = (__ElementType)(__value.y); \ + asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x), \ + __AsmType(__tmp.y) \ + : "memory"); \ + } + +#pragma push_macro("__INTRINSIC_STORE4") +#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType, \ + __AsmType) \ + inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) { \ + __TmpType __tmp; \ + using __ElementType = decltype(__tmp.x); \ + __tmp.x = (__ElementType)(__value.x); \ + __tmp.y = (__ElementType)(__value.y); \ + __tmp.z = (__ElementType)(__value.z); \ + __tmp.w = (__ElementType)(__value.w); \ + asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x), \ + __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w) \ + : "memory"); \ + } + +__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r"); +__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r"); 
+__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h"); +__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r"); +__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l"); + +__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r"); +__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r"); +__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h"); +__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h"); +__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r"); +__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r"); +__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l"); + +__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r"); +__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short, + "h"); +__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r"); +__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long, + unsigned long long, "l"); + +__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r"); +__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r"); +__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h"); +__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h"); +__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r"); +__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r"); +__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l"); + +__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f"); +__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d"); +__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f"); +__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f"); +__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d"); + +#pragma pop_macro("__INTRINSIC_STORE") +#pragma 
pop_macro("__INTRINSIC_STORE2") +#pragma pop_macro("__INTRINSIC_STORE4") + #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 #if CUDA_VERSION >= 11000 diff --git a/clang/lib/Headers/cuda_wrappers/bits/c++config.h b/clang/lib/Headers/cuda_wrappers/bits/c++config.h new file mode 100644 index 0000000000000..eafa13a9cc640 --- /dev/null +++ b/clang/lib/Headers/cuda_wrappers/bits/c++config.h @@ -0,0 +1,51 @@ +// libstdc++ uses the non-constexpr function std::__glibcxx_assert_fail() +// to trigger compilation errors when the __glibcxx_assert(cond) macro +// is used in a constexpr context. +// Compilation fails when using code from the libstdc++ (such as std::array) on +// device code, since these assertions invoke a non-constexpr host function from +// device code. +// +// To work around this issue, we declare our own device version of the function + +#ifndef __CLANG_CUDA_WRAPPERS_BITS_CPP_CONFIG +#define __CLANG_CUDA_WRAPPERS_BITS_CPP_CONFIG + +#include_next + +#ifdef _LIBCPP_BEGIN_NAMESPACE_STD +_LIBCPP_BEGIN_NAMESPACE_STD +#else +namespace std { +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_BEGIN_NAMESPACE_VERSION +#endif + +#ifdef _GLIBCXX_VERBOSE_ASSERT +__attribute__((device, noreturn)) inline void +__glibcxx_assert_fail(const char *file, int line, const char *function, + const char *condition) noexcept { + if (file && function && condition) + __builtin_printf("%s:%d: %s: Assertion '%s' failed.\n", file, line, + function, condition); + else if (function) + __builtin_printf("%s: Undefined behavior detected.\n", function); + __builtin_abort(); +} +#endif + +#endif +__attribute__((device, noreturn, __always_inline__, + __visibility__("default"))) inline void +__glibcxx_assert_fail(...) 
noexcept { + __builtin_abort(); +} +#ifdef _LIBCPP_END_NAMESPACE_STD +_LIBCPP_END_NAMESPACE_STD +#else +#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION +_GLIBCXX_END_NAMESPACE_VERSION +#endif +} // namespace std +#endif + +#endif diff --git a/clang/lib/Parse/CMakeLists.txt b/clang/lib/Parse/CMakeLists.txt index 00fde537bb9c6..e6cbf3b868b7d 100644 --- a/clang/lib/Parse/CMakeLists.txt +++ b/clang/lib/Parse/CMakeLists.txt @@ -1,4 +1,5 @@ set(LLVM_LINK_COMPONENTS + FrontendHLSL FrontendOpenMP MC MCParser diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index f31c9265a0074..c1493a5bfd3b3 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -29,6 +29,7 @@ #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/SemaCodeCompletion.h" +#include "clang/Sema/SemaHLSL.h" #include "llvm/Support/TimeProfiler.h" #include @@ -760,6 +761,10 @@ Parser::DeclGroupPtrTy Parser::ParseUsingDeclaration( Decl *AD = ParseAliasDeclarationAfterDeclarator( TemplateInfo, UsingLoc, D, DeclEnd, AS, Attrs, &DeclFromDeclSpec); + + if (!AD) + return nullptr; + return Actions.ConvertDeclToDeclGroup(AD, DeclFromDeclSpec); } @@ -4899,7 +4904,7 @@ void Parser::ParseMicrosoftUuidAttributeArgs(ParsedAttributes &Attrs) { } } -void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) { +void Parser::ParseHLSLRootSignatureAttributeArgs(ParsedAttributes &Attrs) { assert(Tok.is(tok::identifier) && "Expected an identifier to denote which MS attribute to consider"); IdentifierInfo *RootSignatureIdent = Tok.getIdentifierInfo(); @@ -4941,18 +4946,14 @@ void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) { // Construct our identifier StringRef Signature = StrLiteral.value()->getString(); - auto Hash = llvm::hash_value(Signature); - std::string IdStr = "__hlsl_rootsig_decl_" + std::to_string(Hash); - IdentifierInfo *DeclIdent = &(Actions.getASTContext().Idents.get(IdStr)); - - 
LookupResult R(Actions, DeclIdent, SourceLocation(), - Sema::LookupOrdinaryName); - // Check if we have already found a decl of the same name, if we haven't - // then parse the root signature string and construct the in-memory elements - if (!Actions.LookupQualifiedName(R, Actions.CurContext)) { + auto [DeclIdent, Found] = + Actions.HLSL().ActOnStartRootSignatureDecl(Signature); + // If we haven't found an already defined DeclIdent then parse the root + // signature string and construct the in-memory elements + if (!Found) { + // Offset location 1 to account for '"' SourceLocation SignatureLoc = - StrLiteral.value()->getExprLoc().getLocWithOffset( - 1); // offset 1 for '"' + StrLiteral.value()->getExprLoc().getLocWithOffset(1); // Invoke the root signature parser to construct the in-memory constructs hlsl::RootSignatureLexer Lexer(Signature, SignatureLoc); SmallVector RootElements; @@ -4962,12 +4963,9 @@ void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) { return; } - // Create the Root Signature - auto *SignatureDecl = HLSLRootSignatureDecl::Create( - Actions.getASTContext(), /*DeclContext=*/Actions.CurContext, - RootSignatureLoc, DeclIdent, RootElements); - SignatureDecl->setImplicit(); - Actions.PushOnScopeChains(SignatureDecl, getCurScope()); + // Construct the declaration. 
+ Actions.HLSL().ActOnFinishRootSignatureDecl(RootSignatureLoc, DeclIdent, + RootElements); } // Create the arg for the ParsedAttr @@ -5010,7 +5008,7 @@ void Parser::ParseMicrosoftAttributes(ParsedAttributes &Attrs) { if (Tok.getIdentifierInfo()->getName() == "uuid") ParseMicrosoftUuidAttributeArgs(Attrs); else if (Tok.getIdentifierInfo()->getName() == "RootSignature") - ParseMicrosoftRootSignatureAttributeArgs(Attrs); + ParseHLSLRootSignatureAttributeArgs(Attrs); else { IdentifierInfo *II = Tok.getIdentifierInfo(); SourceLocation NameLoc = Tok.getLocation(); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 5cffd82e3372e..1bf72e5bb7b9d 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -62,6 +62,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Frontend/HLSL/HLSLRootSignature.h" #include "llvm/Support/SaveAndRestore.h" #include "llvm/TargetParser/Triple.h" #include @@ -13962,31 +13963,10 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { // We allow integer constant expressions in all cases. } else if (DclT->isIntegralOrEnumerationType()) { - // Check whether the expression is a constant expression. - SourceLocation Loc; if (getLangOpts().CPlusPlus11 && DclT.isVolatileQualified()) // In C++11, a non-constexpr const static data member with an // in-class initializer cannot be volatile. Diag(VDecl->getLocation(), diag::err_in_class_initializer_volatile); - else if (Init->isValueDependent()) - ; // Nothing to check. - else if (Init->isIntegerConstantExpr(Context, &Loc)) - ; // Ok, it's an ICE! - else if (Init->getType()->isScopedEnumeralType() && - Init->isCXX11ConstantExpr(Context)) - ; // Ok, it is a scoped-enum constant expression. - else if (Init->isEvaluatable(Context)) { - // If we can constant fold the initializer through heroics, accept it, - // but report this as a use of an extension for -pedantic. 
- Diag(Loc, diag::ext_in_class_initializer_non_constant) - << Init->getSourceRange(); - } else { - // Otherwise, this is some crazy unknown case. Report the issue at the - // location provided by the isIntegerConstantExpr failed check. - Diag(Loc, diag::err_in_class_initializer_non_constant) - << Init->getSourceRange(); - VDecl->setInvalidDecl(); - } // We allow foldable floating-point constants as an extension. } else if (DclT->isFloatingType()) { // also permits complex, which is ok @@ -14714,6 +14694,17 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) { // Compute and cache the constant value, and remember that we have a // constant initializer. if (HasConstInit) { + if (var->isStaticDataMember() && !var->isInline() && + var->getLexicalDeclContext()->isRecord() && + type->isIntegralOrEnumerationType()) { + // In C++98, in-class initialization for a static data member must + // be an integer constant expression. + SourceLocation Loc; + if (!Init->isIntegerConstantExpr(Context, &Loc)) { + Diag(Loc, diag::ext_in_class_initializer_non_constant) + << Init->getSourceRange(); + } + } (void)var->checkForConstantInitialization(Notes); Notes.clear(); } else if (CacheCulprit) { @@ -14749,6 +14740,13 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) { << Attr->getRange() << Attr->isConstinit(); for (auto &it : Notes) Diag(it.first, it.second); + } else if (var->isStaticDataMember() && !var->isInline() && + var->getLexicalDeclContext()->isRecord()) { + Diag(var->getLocation(), diag::err_in_class_initializer_non_constant) + << Init->getSourceRange(); + for (auto &it : Notes) + Diag(it.first, it.second); + var->setInvalidDecl(); } else if (IsGlobal && !getDiagnostics().isIgnored(diag::warn_global_constructor, var->getLocation())) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 413eff4aa294a..ebc43157d4c2b 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -7176,6 +7176,7 @@ 
Sema::BuildCompoundLiteralExpr(SourceLocation LParenLoc, TypeSourceInfo *TInfo, // void func(char *para[(int [1]){ 0 }[0]); const Scope *S = getCurScope(); bool IsFileScope = !CurContext->isFunctionOrMethod() && + !S->isInCFunctionScope() && (!S || !S->isFunctionPrototypeScope()); // In C, compound literals are l-values for some reason. diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index ba491b6134293..b55f4fd786b58 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -978,6 +978,31 @@ void SemaHLSL::emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, << NewFnName << FixItHint::CreateReplacement(FullRange, OS.str()); } +std::pair +SemaHLSL::ActOnStartRootSignatureDecl(StringRef Signature) { + llvm::hash_code Hash = llvm::hash_value(Signature); + std::string IdStr = "__hlsl_rootsig_decl_" + std::to_string(Hash); + IdentifierInfo *DeclIdent = &(getASTContext().Idents.get(IdStr)); + + // Check if we have already found a decl of the same name. + LookupResult R(SemaRef, DeclIdent, SourceLocation(), + Sema::LookupOrdinaryName); + bool Found = SemaRef.LookupQualifiedName(R, SemaRef.CurContext); + return {DeclIdent, Found}; +} + +void SemaHLSL::ActOnFinishRootSignatureDecl( + SourceLocation Loc, IdentifierInfo *DeclIdent, + SmallVector &Elements) { + + auto *SignatureDecl = HLSLRootSignatureDecl::Create( + SemaRef.getASTContext(), /*DeclContext=*/SemaRef.CurContext, Loc, + DeclIdent, Elements); + + SignatureDecl->setImplicit(); + SemaRef.PushOnScopeChains(SignatureDecl, SemaRef.getCurScope()); +} + void SemaHLSL::handleRootSignatureAttr(Decl *D, const ParsedAttr &AL) { if (AL.getNumArgs() != 1) { Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 1; @@ -2206,8 +2231,9 @@ static void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall, QualType ReturnType) { auto *VecTyA = TheCall->getArg(0)->getType()->getAs(); if (VecTyA) - ReturnType = S->Context.getVectorType(ReturnType, VecTyA->getNumElements(), - 
VectorKind::Generic); + ReturnType = + S->Context.getExtVectorType(ReturnType, VecTyA->getNumElements()); + TheCall->setType(ReturnType); } @@ -2520,8 +2546,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { if (auto *VecTy = EltTy->getAs()) { EltTy = VecTy->getElementType(); - ResTy = SemaRef.Context.getVectorType(ResTy, VecTy->getNumElements(), - VecTy->getVectorKind()); + ResTy = SemaRef.Context.getExtVectorType(ResTy, VecTy->getNumElements()); } if (!EltTy->isIntegerType()) { diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp index 1738ab4466001..4dbb2450857e0 100644 --- a/clang/lib/Sema/SemaTypeTraits.cpp +++ b/clang/lib/Sema/SemaTypeTraits.cpp @@ -188,6 +188,7 @@ static bool IsEligibleForTrivialRelocation(Sema &SemaRef, return false; } + bool IsUnion = D->isUnion(); for (const FieldDecl *Field : D->fields()) { if (Field->getType()->isDependentType()) continue; @@ -197,6 +198,12 @@ static bool IsEligibleForTrivialRelocation(Sema &SemaRef, // of a trivially relocatable type if (!SemaRef.IsCXXTriviallyRelocatableType(Field->getType())) return false; + + // A union contains values with address discriminated pointer auth + // cannot be relocated. 
+ if (IsUnion && SemaRef.Context.containsAddressDiscriminatedPointerAuth( + Field->getType())) + return false; } return !D->hasDeletedDestructor(); } @@ -313,7 +320,6 @@ bool Sema::IsCXXTriviallyRelocatableType(const CXXRecordDecl &RD) { } bool Sema::IsCXXTriviallyRelocatableType(QualType Type) { - QualType BaseElementType = getASTContext().getBaseElementType(Type); if (Type->isVariableArrayType()) @@ -322,10 +328,10 @@ bool Sema::IsCXXTriviallyRelocatableType(QualType Type) { if (BaseElementType.hasNonTrivialObjCLifetime()) return false; - if (BaseElementType.hasAddressDiscriminatedPointerAuth()) + if (BaseElementType->isIncompleteType()) return false; - if (BaseElementType->isIncompleteType()) + if (Context.containsNonRelocatablePointerAuth(Type)) return false; if (BaseElementType->isScalarType() || BaseElementType->isVectorType()) @@ -670,7 +676,10 @@ static bool IsTriviallyRelocatableType(Sema &SemaRef, QualType T) { if (!BaseElementType->isObjectType()) return false; - if (T.hasAddressDiscriminatedPointerAuth()) + // The deprecated __builtin_is_trivially_relocatable does not have + // an equivalent to __builtin_trivially_relocate, so there is no + // safe way to use it if there are any address discriminated values. + if (SemaRef.getASTContext().containsAddressDiscriminatedPointerAuth(T)) return false; if (const auto *RD = BaseElementType->getAsCXXRecordDecl(); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index ab1b5b333e06a..be22ee5221911 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -5167,8 +5167,9 @@ void ASTRecordWriter::AddAttr(const Attr *A) { // FIXME: Clang can't handle the serialization/deserialization of // preferred_name properly now. See // https://github.com/llvm/llvm-project/issues/56490 for example. 
- if (!A || (isa(A) && - Writer->isWritingStdCXXNamedModules())) + if (!A || + (isa(A) && (Writer->isWritingStdCXXNamedModules() || + Writer->isWritingStdCXXHeaderUnit()))) return Record.push_back(0); Record.push_back(A->getKind() + 1); // FIXME: stable encoding, target attrs diff --git a/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp index 15d73fb9ca39a..ab90615f63182 100644 --- a/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp @@ -69,7 +69,7 @@ void DivZeroChecker::reportTaintBug( llvm::ArrayRef TaintedSyms) const { if (!TaintedDivChecker.isEnabled()) return; - if (ExplodedNode *N = C.generateNonFatalErrorNode(StateZero)) { + if (ExplodedNode *N = C.generateErrorNode(StateZero)) { auto R = std::make_unique(TaintedDivChecker, Msg, N); bugreporter::trackExpressionValue(N, getDenomExpr(N), *R); @@ -113,9 +113,9 @@ void DivZeroChecker::checkPreStmt(const BinaryOperator *B, if ((stateNotZero && stateZero)) { std::vector taintedSyms = getTaintedSymbols(C.getState(), *DV); if (!taintedSyms.empty()) { - reportTaintBug("Division by a tainted value, possibly zero", stateNotZero, - C, taintedSyms); - return; + reportTaintBug("Division by a tainted value, possibly zero", stateZero, C, + taintedSyms); + // Fallthrough to continue analysis in case of non-zero denominator. 
} } diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index fef33509c0b6e..35e98a5e2719a 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -1371,6 +1371,20 @@ void MallocChecker::checkIfFreeNameIndex(ProgramStateRef State, C.addTransition(State); } +const Expr *getPlacementNewBufferArg(const CallExpr *CE, + const FunctionDecl *FD) { + // Checking for signature: + // void* operator new ( std::size_t count, void* ptr ); + // void* operator new[]( std::size_t count, void* ptr ); + if (CE->getNumArgs() != 2 || (FD->getOverloadedOperator() != OO_New && + FD->getOverloadedOperator() != OO_Array_New)) + return nullptr; + auto BuffType = FD->getParamDecl(1)->getType(); + if (BuffType.isNull() || !BuffType->isVoidPointerType()) + return nullptr; + return CE->getArg(1); +} + void MallocChecker::checkCXXNewOrCXXDelete(ProgramStateRef State, const CallEvent &Call, CheckerContext &C) const { @@ -1386,6 +1400,14 @@ void MallocChecker::checkCXXNewOrCXXDelete(ProgramStateRef State, // processed by the checkPostStmt callbacks for CXXNewExpr and // CXXDeleteExpr. 
const FunctionDecl *FD = C.getCalleeDecl(CE); + if (const auto *BufArg = getPlacementNewBufferArg(CE, FD)) { + // Placement new does not allocate memory + auto RetVal = State->getSVal(BufArg, Call.getLocationContext()); + State = State->BindExpr(CE, C.getLocationContext(), RetVal); + C.addTransition(State); + return; + } + switch (FD->getOverloadedOperator()) { case OO_New: State = MallocMemAux(C, Call, CE->getArg(0), UndefinedVal(), State, diff --git a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp index 461d01b452fd0..9744d1abf7790 100644 --- a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp @@ -81,11 +81,12 @@ enum class ErrorKind : int { }; class NullabilityChecker - : public Checker, - check::PostCall, check::PostStmt, - check::PostObjCMessage, check::DeadSymbols, eval::Assume, - check::Location, check::Event, - check::BeginFunction> { + : public CheckerFamily< + check::Bind, check::PreCall, check::PreStmt, + check::PostCall, check::PostStmt, + check::PostObjCMessage, check::DeadSymbols, eval::Assume, + check::Location, check::Event, + check::BeginFunction> { public: // If true, the checker will not diagnose nullabilility issues for calls @@ -113,25 +114,21 @@ class NullabilityChecker void printState(raw_ostream &Out, ProgramStateRef State, const char *NL, const char *Sep) const override; - enum CheckKind { - CK_NullPassedToNonnull, - CK_NullReturnedFromNonnull, - CK_NullableDereferenced, - CK_NullablePassedToNonnull, - CK_NullableReturnedFromNonnull, - CK_NumCheckKinds - }; - - bool ChecksEnabled[CK_NumCheckKinds] = {false}; - CheckerNameRef CheckNames[CK_NumCheckKinds]; - mutable std::unique_ptr BTs[CK_NumCheckKinds]; - - const std::unique_ptr &getBugType(CheckKind Kind) const { - if (!BTs[Kind]) - BTs[Kind].reset(new BugType(CheckNames[Kind], "Nullability", - categories::MemoryError)); - return BTs[Kind]; - } + 
StringRef getDebugTag() const override { return "NullabilityChecker"; } + + // FIXME: All bug types share the same Description ("Nullability") since the + // creation of this checker. We should write more descriptive descriptions... + // or just eliminate the Description field if it is meaningless? + CheckerFrontendWithBugType NullPassedToNonnull{"Nullability", + categories::MemoryError}; + CheckerFrontendWithBugType NullReturnedFromNonnull{"Nullability", + categories::MemoryError}; + CheckerFrontendWithBugType NullableDereferenced{"Nullability", + categories::MemoryError}; + CheckerFrontendWithBugType NullablePassedToNonnull{"Nullability", + categories::MemoryError}; + CheckerFrontendWithBugType NullableReturnedFromNonnull{ + "Nullability", categories::MemoryError}; // When set to false no nullability information will be tracked in // NullabilityMap. It is possible to catch errors like passing a null pointer @@ -164,17 +161,16 @@ class NullabilityChecker /// /// When \p SuppressPath is set to true, no more bugs will be reported on this /// path by this checker. 
- void reportBugIfInvariantHolds(StringRef Msg, ErrorKind Error, CheckKind CK, - ExplodedNode *N, const MemRegion *Region, - CheckerContext &C, + void reportBugIfInvariantHolds(StringRef Msg, ErrorKind Error, + const BugType &BT, ExplodedNode *N, + const MemRegion *Region, CheckerContext &C, const Stmt *ValueExpr = nullptr, bool SuppressPath = false) const; - void reportBug(StringRef Msg, ErrorKind Error, CheckKind CK, ExplodedNode *N, - const MemRegion *Region, BugReporter &BR, + void reportBug(StringRef Msg, ErrorKind Error, const BugType &BT, + ExplodedNode *N, const MemRegion *Region, BugReporter &BR, const Stmt *ValueExpr = nullptr) const { - const std::unique_ptr &BT = getBugType(CK); - auto R = std::make_unique(*BT, Msg, N); + auto R = std::make_unique(BT, Msg, N); if (Region) { R->markInteresting(Region); R->addVisitor(Region); @@ -480,7 +476,7 @@ static bool checkInvariantViolation(ProgramStateRef State, ExplodedNode *N, } void NullabilityChecker::reportBugIfInvariantHolds( - StringRef Msg, ErrorKind Error, CheckKind CK, ExplodedNode *N, + StringRef Msg, ErrorKind Error, const BugType &BT, ExplodedNode *N, const MemRegion *Region, CheckerContext &C, const Stmt *ValueExpr, bool SuppressPath) const { ProgramStateRef OriginalState = N->getState(); @@ -492,7 +488,7 @@ void NullabilityChecker::reportBugIfInvariantHolds( N = C.addTransition(OriginalState, N); } - reportBug(Msg, Error, CK, N, Region, C.getBugReporter(), ValueExpr); + reportBug(Msg, Error, BT, N, Region, C.getBugReporter(), ValueExpr); } /// Cleaning up the program state. @@ -546,19 +542,19 @@ void NullabilityChecker::checkEvent(ImplicitNullDerefEvent Event) const { if (!TrackedNullability) return; - if (ChecksEnabled[CK_NullableDereferenced] && + if (NullableDereferenced.isEnabled() && TrackedNullability->getValue() == Nullability::Nullable) { BugReporter &BR = *Event.BR; // Do not suppress errors on defensive code paths, because dereferencing // a nullable pointer is always an error. 
if (Event.IsDirectDereference) reportBug("Nullable pointer is dereferenced", - ErrorKind::NullableDereferenced, CK_NullableDereferenced, + ErrorKind::NullableDereferenced, NullableDereferenced, Event.SinkNode, Region, BR); else { reportBug("Nullable pointer is passed to a callee that requires a " "non-null", - ErrorKind::NullablePassedToNonnull, CK_NullableDereferenced, + ErrorKind::NullablePassedToNonnull, NullableDereferenced, Event.SinkNode, Region, BR); } } @@ -710,29 +706,28 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S, Nullability RetExprTypeLevelNullability = getNullabilityAnnotation(lookThroughImplicitCasts(RetExpr)->getType()); - bool NullReturnedFromNonNull = (RequiredNullability == Nullability::Nonnull && - Nullness == NullConstraint::IsNull); - if (ChecksEnabled[CK_NullReturnedFromNonnull] && NullReturnedFromNonNull && - RetExprTypeLevelNullability != Nullability::Nonnull && - !InSuppressedMethodFamily) { - ExplodedNode *N = C.generateErrorNode(State); - if (!N) - return; + if (RequiredNullability == Nullability::Nonnull && + Nullness == NullConstraint::IsNull) { + if (NullReturnedFromNonnull.isEnabled() && + RetExprTypeLevelNullability != Nullability::Nonnull && + !InSuppressedMethodFamily) { + ExplodedNode *N = C.generateErrorNode(State); + if (!N) + return; - SmallString<256> SBuf; - llvm::raw_svector_ostream OS(SBuf); - OS << (RetExpr->getType()->isObjCObjectPointerType() ? "nil" : "Null"); - OS << " returned from a " << C.getDeclDescription(D) << - " that is expected to return a non-null value"; - reportBugIfInvariantHolds(OS.str(), ErrorKind::NilReturnedToNonnull, - CK_NullReturnedFromNonnull, N, nullptr, C, - RetExpr); - return; - } + SmallString<256> SBuf; + llvm::raw_svector_ostream OS(SBuf); + OS << (RetExpr->getType()->isObjCObjectPointerType() ? 
"nil" : "Null"); + OS << " returned from a " << C.getDeclDescription(D) + << " that is expected to return a non-null value"; + reportBugIfInvariantHolds(OS.str(), ErrorKind::NilReturnedToNonnull, + NullReturnedFromNonnull, N, nullptr, C, + RetExpr); + return; + } - // If null was returned from a non-null function, mark the nullability - // invariant as violated even if the diagnostic was suppressed. - if (NullReturnedFromNonNull) { + // If null was returned from a non-null function, mark the nullability + // invariant as violated even if the diagnostic was suppressed. State = State->set(true); C.addTransition(State); return; @@ -746,7 +741,7 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S, State->get(Region); if (TrackedNullability) { Nullability TrackedNullabValue = TrackedNullability->getValue(); - if (ChecksEnabled[CK_NullableReturnedFromNonnull] && + if (NullableReturnedFromNonnull.isEnabled() && Nullness != NullConstraint::IsNotNull && TrackedNullabValue == Nullability::Nullable && RequiredNullability == Nullability::Nonnull) { @@ -758,7 +753,7 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S, " that is expected to return a non-null value"; reportBugIfInvariantHolds(OS.str(), ErrorKind::NullableReturnedToNonnull, - CK_NullableReturnedFromNonnull, N, Region, C); + NullableReturnedFromNonnull, N, Region, C); } return; } @@ -809,8 +804,7 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call, unsigned ParamIdx = Param->getFunctionScopeIndex() + 1; - if (ChecksEnabled[CK_NullPassedToNonnull] && - Nullness == NullConstraint::IsNull && + if (NullPassedToNonnull.isEnabled() && Nullness == NullConstraint::IsNull && ArgExprTypeLevelNullability != Nullability::Nonnull && RequiredNullability == Nullability::Nonnull && isDiagnosableCall(Call)) { @@ -824,7 +818,7 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call, OS << " passed to a callee that requires a non-null " << ParamIdx << llvm::getOrdinalSuffix(ParamIdx) << " 
parameter"; reportBugIfInvariantHolds(OS.str(), ErrorKind::NilPassedToNonnull, - CK_NullPassedToNonnull, N, nullptr, C, ArgExpr, + NullPassedToNonnull, N, nullptr, C, ArgExpr, /*SuppressPath=*/false); return; } @@ -841,7 +835,7 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call, TrackedNullability->getValue() != Nullability::Nullable) continue; - if (ChecksEnabled[CK_NullablePassedToNonnull] && + if (NullablePassedToNonnull.isEnabled() && RequiredNullability == Nullability::Nonnull && isDiagnosableCall(Call)) { ExplodedNode *N = C.addTransition(State); @@ -850,17 +844,16 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call, OS << "Nullable pointer is passed to a callee that requires a non-null " << ParamIdx << llvm::getOrdinalSuffix(ParamIdx) << " parameter"; reportBugIfInvariantHolds(OS.str(), ErrorKind::NullablePassedToNonnull, - CK_NullablePassedToNonnull, N, Region, C, + NullablePassedToNonnull, N, Region, C, ArgExpr, /*SuppressPath=*/true); return; } - if (ChecksEnabled[CK_NullableDereferenced] && + if (NullableDereferenced.isEnabled() && Param->getType()->isReferenceType()) { ExplodedNode *N = C.addTransition(State); - reportBugIfInvariantHolds("Nullable pointer is dereferenced", - ErrorKind::NullableDereferenced, - CK_NullableDereferenced, N, Region, C, - ArgExpr, /*SuppressPath=*/true); + reportBugIfInvariantHolds( + "Nullable pointer is dereferenced", ErrorKind::NullableDereferenced, + NullableDereferenced, N, Region, C, ArgExpr, /*SuppressPath=*/true); return; } continue; @@ -1294,7 +1287,7 @@ void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S, bool NullAssignedToNonNull = (LocNullability == Nullability::Nonnull && RhsNullness == NullConstraint::IsNull); - if (ChecksEnabled[CK_NullPassedToNonnull] && NullAssignedToNonNull && + if (NullPassedToNonnull.isEnabled() && NullAssignedToNonNull && ValNullability != Nullability::Nonnull && ValueExprTypeLevelNullability != Nullability::Nonnull && !isARCNilInitializedLocal(C, S)) 
{ @@ -1312,7 +1305,7 @@ void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S, OS << (LocType->isObjCObjectPointerType() ? "nil" : "Null"); OS << " assigned to a pointer which is expected to have non-null value"; reportBugIfInvariantHolds(OS.str(), ErrorKind::NilAssignedToNonnull, - CK_NullPassedToNonnull, N, nullptr, C, ValueStmt); + NullPassedToNonnull, N, nullptr, C, ValueStmt); return; } @@ -1338,13 +1331,13 @@ void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S, if (RhsNullness == NullConstraint::IsNotNull || TrackedNullability->getValue() != Nullability::Nullable) return; - if (ChecksEnabled[CK_NullablePassedToNonnull] && + if (NullablePassedToNonnull.isEnabled() && LocNullability == Nullability::Nonnull) { ExplodedNode *N = C.addTransition(State, C.getPredecessor()); reportBugIfInvariantHolds("Nullable pointer is assigned to a pointer " "which is expected to have non-null value", ErrorKind::NullableAssignedToNonnull, - CK_NullablePassedToNonnull, N, ValueRegion, C); + NullablePassedToNonnull, N, ValueRegion, C); } return; } @@ -1391,28 +1384,26 @@ void NullabilityChecker::printState(raw_ostream &Out, ProgramStateRef State, } } -void ento::registerNullabilityBase(CheckerManager &mgr) { - mgr.registerChecker(); -} - -bool ento::shouldRegisterNullabilityBase(const CheckerManager &mgr) { - return true; -} - -#define REGISTER_CHECKER(name, trackingRequired) \ - void ento::register##name##Checker(CheckerManager &mgr) { \ - NullabilityChecker *checker = mgr.getChecker(); \ - checker->ChecksEnabled[NullabilityChecker::CK_##name] = true; \ - checker->CheckNames[NullabilityChecker::CK_##name] = \ - mgr.getCurrentCheckerName(); \ - checker->NeedTracking = checker->NeedTracking || trackingRequired; \ - checker->NoDiagnoseCallsToSystemHeaders = \ - checker->NoDiagnoseCallsToSystemHeaders || \ - mgr.getAnalyzerOptions().getCheckerBooleanOption( \ - checker, "NoDiagnoseCallsToSystemHeaders", true); \ +// The checker group "nullability" (which 
consists of the checkers that are +// implemented in this file) has a group-level configuration option which +// affects all the checkers in the group. As this is a completely unique +// remnant of old design (this is the only group option in the analyzer), there +// is no machinery to inject the group name from `Checkers.td`, so it is simply +// hardcoded here: +constexpr llvm::StringLiteral GroupName = "nullability"; +constexpr llvm::StringLiteral GroupOptName = "NoDiagnoseCallsToSystemHeaders"; + +#define REGISTER_CHECKER(NAME, TRACKING_REQUIRED) \ + void ento::register##NAME##Checker(CheckerManager &Mgr) { \ + NullabilityChecker *Chk = Mgr.getChecker(); \ + Chk->NAME.enable(Mgr); \ + Chk->NeedTracking = Chk->NeedTracking || TRACKING_REQUIRED; \ + Chk->NoDiagnoseCallsToSystemHeaders = \ + Mgr.getAnalyzerOptions().getCheckerBooleanOption(GroupName, \ + GroupOptName, true); \ } \ \ - bool ento::shouldRegister##name##Checker(const CheckerManager &mgr) { \ + bool ento::shouldRegister##NAME##Checker(const CheckerManager &) { \ return true; \ } diff --git a/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp b/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp index 836fc375809ad..f965bfb590d80 100644 --- a/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp +++ b/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp @@ -92,7 +92,7 @@ void Z3CrosscheckVisitor::finalizeVisitor(BugReporterContext &BRC, }; auto AttemptOnce = [&](const llvm::SMTSolverRef &Solver) -> Z3Result { - constexpr auto getCurrentTime = llvm::TimeRecord::getCurrentTime; + auto getCurrentTime = llvm::TimeRecord::getCurrentTime; unsigned InitialRLimit = GetUsedRLimit(Solver); double Start = getCurrentTime(/*Start=*/true).getWallTime(); std::optional IsSAT = Solver->check(); diff --git a/clang/test/Analysis/NewDelete-checker-test.cpp b/clang/test/Analysis/NewDelete-checker-test.cpp index 06754f669b1e6..da0eef7c52bd8 100644 --- a/clang/test/Analysis/NewDelete-checker-test.cpp +++ 
b/clang/test/Analysis/NewDelete-checker-test.cpp @@ -26,9 +26,10 @@ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks // // RUN: %clang_analyze_cc1 -std=c++17 -fblocks -verify %s \ -// RUN: -verify=expected,leak \ +// RUN: -verify=expected,leak,inspection \ // RUN: -analyzer-checker=core \ -// RUN: -analyzer-checker=cplusplus.NewDeleteLeaks +// RUN: -analyzer-checker=cplusplus.NewDeleteLeaks \ +// RUN: -analyzer-checker=debug.ExprInspection #include "Inputs/system-header-simulator-cxx.h" @@ -63,6 +64,39 @@ void testGlobalNoThrowPlacementExprNewBeforeOverload() { int *p = new(std::nothrow) int; } // leak-warning{{Potential leak of memory pointed to by 'p'}} +//----- Standard pointer placement operators +void testGlobalPointerPlacementNew() { + int i; + void *p1 = operator new(0, &i); // no leak: placement new never allocates + void *p2 = operator new[](0, &i); // no leak + int *p3 = new(&i) int; // no leak + int *p4 = new(&i) int[0]; // no leak +} + +template +void clang_analyzer_dump(T x); + +void testPlacementNewBufValue() { + int i = 10; + int *p = new(&i) int; + clang_analyzer_dump(p); // inspection-warning{{&i}} + clang_analyzer_dump(*p); // inspection-warning{{10}} +} + +void testPlacementNewBufValueExplicitOp() { + int i = 10; + int *p = (int*)operator new(sizeof(int), &i); + clang_analyzer_dump(p); // inspection-warning{{&i}} + clang_analyzer_dump(*p); // inspection-warning{{10}} +} + +void testPlacementArrNewBufValueExplicitArrOp() { + int i = 10; + int *p = (int*)operator new[](sizeof(int), &i); + clang_analyzer_dump(p); // inspection-warning{{&i}} + clang_analyzer_dump(*p); // inspection-warning{{10}} +} + //----- Other cases void testNewMemoryIsInHeap() { int *p = new int; diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c index 66b9be9795f12..78ee00deea18d 100644 --- a/clang/test/Analysis/analyzer-enabled-checkers.c +++ b/clang/test/Analysis/analyzer-enabled-checkers.c @@ -34,7 +34,6 @@ // 
CHECK-NEXT: core.uninitialized.CapturedBlockVariable // CHECK-NEXT: core.uninitialized.UndefReturn // CHECK-NEXT: deadcode.DeadStores -// CHECK-NEXT: nullability.NullabilityBase // CHECK-NEXT: nullability.NullPassedToNonnull // CHECK-NEXT: nullability.NullReturnedFromNonnull // CHECK-NEXT: security.insecureAPI.SecuritySyntaxChecker diff --git a/clang/test/Analysis/bugfix-124477.m b/clang/test/Analysis/bugfix-124477.m index 80820f4c93444..8bb0196b2f9b8 100644 --- a/clang/test/Analysis/bugfix-124477.m +++ b/clang/test/Analysis/bugfix-124477.m @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,apiModeling,nullability.NullableDereferenced,nullability.NullabilityBase -x objective-c %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,apiModeling,nullability.NullableDereferenced -x objective-c %s /* This test is reduced from a static analyzer crash. The bug causing the crash is explained in #124477. It can only be triggered in some diff --git a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c index 8c6078a49c231..7f9c9ff4c9fd7 100644 --- a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c +++ b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c @@ -42,7 +42,6 @@ // CHECK-NEXT: core.uninitialized.CapturedBlockVariable // CHECK-NEXT: core.uninitialized.UndefReturn // CHECK-NEXT: deadcode.DeadStores -// CHECK-NEXT: nullability.NullabilityBase // CHECK-NEXT: nullability.NullPassedToNonnull // CHECK-NEXT: nullability.NullReturnedFromNonnull // CHECK-NEXT: security.insecureAPI.SecuritySyntaxChecker diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c index 3c520612c5d9b..9d6d2942df4a9 100644 --- a/clang/test/Analysis/taint-generic.c +++ b/clang/test/Analysis/taint-generic.c @@ -412,6 +412,19 @@ int testTaintedDivFP(void) { return 5/x; // x cannot be 0, so no tainted warning either } +void 
clang_analyzer_warnIfReached(); + +int testTaintDivZeroNonfatal() { + int x; + scanf("%d", &x); + int y = 5/x; // expected-warning {{Division by a tainted value, possibly zero}} + if (x == 0) + clang_analyzer_warnIfReached(); + else + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + return y; +} + // Zero-sized VLAs. void testTaintedVLASize(void) { int x; diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp index 2706ea7f8f857..322c13c8f081a 100644 --- a/clang/test/CIR/CodeGen/builtin_call.cpp +++ b/clang/test/CIR/CodeGen/builtin_call.cpp @@ -76,3 +76,21 @@ float constant_fp_builtin_single() { // OGCG: define {{.*}}float @_Z26constant_fp_builtin_singlev() // OGCG: ret float 0x3FB99999A0000000 // OGCG: } + +void library_builtins() { + __builtin_printf(nullptr); + __builtin_abort(); +} + +// CIR: cir.func @_Z16library_builtinsv() { +// CIR: %[[NULL:.+]] = cir.const #cir.ptr : !cir.ptr +// CIR: cir.call @printf(%[[NULL]]) : (!cir.ptr) -> !s32i +// CIR: cir.call @abort() : () -> () + +// LLVM: define void @_Z16library_builtinsv() +// LLVM: call i32 (ptr, ...) @printf(ptr null) +// LLVM: call void @abort() + +// OGCG: define dso_local void @_Z16library_builtinsv() +// OGCG: call i32 (ptr, ...) 
@printf(ptr noundef null) +// OGCG: call void @abort() diff --git a/clang/test/CIR/CodeGen/builtin_printf.cpp b/clang/test/CIR/CodeGen/builtin_printf.cpp new file mode 100644 index 0000000000000..366e474c2b09a --- /dev/null +++ b/clang/test/CIR/CodeGen/builtin_printf.cpp @@ -0,0 +1,65 @@ +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +// CIR: cir.global "private" cir_private dsolocal @".str" = #cir.const_array<"%s\00" : !cir.array> : !cir.array +// CIR: cir.global "private" cir_private dsolocal @".str.1" = #cir.const_array<"%s %d\0A\00" : !cir.array> : !cir.array +// LLVM: @.str = private global [3 x i8] c"%s\00" +// LLVM: @.str.1 = private global [7 x i8] c"%s %d\0A\00" +// OGCG: @.str = private unnamed_addr constant [3 x i8] c"%s\00" +// OGCG: @.str.1 = private unnamed_addr constant [7 x i8] c"%s %d\0A\00" + +void func(char const * const str, int i) { + __builtin_printf(nullptr); + __builtin_printf("%s", str); + __builtin_printf("%s %d\n", str, i); +} + +// CIR: cir.func @printf(!cir.ptr, ...) 
-> !s32i + +// CIR: cir.func @_Z4funcPKci(%[[arg0:.+]]: !cir.ptr{{.*}}, %[[arg1:.+]]: !s32i{{.*}}) { +// CIR: %[[str_ptr:.+]] = cir.alloca !cir.ptr, !cir.ptr>, ["str", init, const] +// CIR: %[[i_ptr:.+]] = cir.alloca !s32i, !cir.ptr, ["i", init] +// CIR: cir.store %[[arg0]], %[[str_ptr]] : !cir.ptr, !cir.ptr> +// CIR: cir.store %[[arg1]], %[[i_ptr]] : !s32i, !cir.ptr +// CIR: %[[null_ptr:.+]] = cir.const #cir.ptr : !cir.ptr +// CIR: %[[printf_result1:.+]] = cir.call @printf(%[[null_ptr]]) : (!cir.ptr) -> !s32i +// CIR: %[[str_fmt_global:.+]] = cir.get_global @".str" : !cir.ptr> +// CIR: %[[str_fmt_ptr:.+]] = cir.cast(array_to_ptrdecay, %[[str_fmt_global]] : !cir.ptr>), !cir.ptr +// CIR: %[[str_val:.+]] = cir.load{{.*}} %[[str_ptr]] : !cir.ptr>, !cir.ptr +// CIR: %[[printf_result2:.+]] = cir.call @printf(%[[str_fmt_ptr]], %[[str_val]]) : (!cir.ptr, !cir.ptr) -> !s32i +// CIR: %[[full_fmt_global:.+]] = cir.get_global @".str.1" : !cir.ptr> +// CIR: %[[full_fmt_ptr:.+]] = cir.cast(array_to_ptrdecay, %[[full_fmt_global]] : !cir.ptr>), !cir.ptr +// CIR: %[[str_val2:.+]] = cir.load{{.*}} %[[str_ptr]] : !cir.ptr>, !cir.ptr +// CIR: %[[i_val:.+]] = cir.load{{.*}} %[[i_ptr]] : !cir.ptr, !s32i +// CIR: %[[printf_result3:.+]] = cir.call @printf(%[[full_fmt_ptr]], %[[str_val2]], %[[i_val]]) : (!cir.ptr, !cir.ptr, !s32i) -> !s32i +// CIR: cir.return + +// LLVM: define void @_Z4funcPKci(ptr %[[arg0:.+]], i32 %[[arg1:.+]]) +// LLVM: %[[str_ptr:.+]] = alloca ptr +// LLVM: %[[i_ptr:.+]] = alloca i32 +// LLVM: store ptr %[[arg0]], ptr %[[str_ptr]]{{.*}} +// LLVM: store i32 %[[arg1]], ptr %[[i_ptr]]{{.*}} +// LLVM: %[[printf_result1:.+]] = call i32 (ptr, ...) @printf(ptr null) +// LLVM: %[[str_val:.+]] = load ptr, ptr %[[str_ptr]]{{.*}} +// LLVM: %[[printf_result2:.+]] = call i32 (ptr, ...) 
@printf(ptr @.str, ptr %[[str_val]]) +// LLVM: %[[str_val2:.+]] = load ptr, ptr %[[str_ptr]]{{.*}} +// LLVM: %[[i_val:.+]] = load i32, ptr %[[i_ptr]]{{.*}} +// LLVM: %[[printf_result3:.+]] = call i32 (ptr, ...) @printf(ptr @.str.1, ptr %[[str_val2]], i32 %[[i_val]]) +// LLVM: ret void + +// OGCG: define dso_local void @_Z4funcPKci(ptr noundef %[[arg0:.+]], i32 noundef %[[arg1:.+]]) +// OGCG: %[[str_ptr:.+]] = alloca ptr +// OGCG: %[[i_ptr:.+]] = alloca i32 +// OGCG: store ptr %[[arg0]], ptr %[[str_ptr]]{{.*}} +// OGCG: store i32 %[[arg1]], ptr %[[i_ptr]]{{.*}} +// OGCG: %[[printf_result1:.+]] = call i32 (ptr, ...) @printf(ptr noundef null) +// OGCG: %[[str_val:.+]] = load ptr, ptr %[[str_ptr]]{{.*}} +// OGCG: %[[printf_result2:.+]] = call i32 (ptr, ...) @printf(ptr noundef @.str, ptr noundef %[[str_val]]) +// OGCG: %[[str_val2:.+]] = load ptr, ptr %[[str_ptr]]{{.*}} +// OGCG: %[[i_val:.+]] = load i32, ptr %[[i_ptr]]{{.*}} +// OGCG: %[[printf_result3:.+]] = call i32 (ptr, ...) @printf(ptr noundef @.str.1, ptr noundef %[[str_val2]], i32 noundef %[[i_val]]) +// OGCG: ret void diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index d193b9f32efbc..db0b9111ab4fb 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -176,3 +176,32 @@ void foo7() { // OGCG: store float %[[TMP_A]], ptr %[[C_REAL_PTR]], align 4 // OGCG: store float 2.000000e+00, ptr %[[C_IMAG_PTR]], align 4 +void foo8() { + double _Complex c = 2.00i; +} + +// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.double, #cir.fp<2.000000e+00> : !cir.double> : !cir.complex + +// LLVM: %[[COMPLEX:.*]] = alloca { double, double }, i64 1, align 8 +// LLVM: store { double, double } { double 0.000000e+00, double 2.000000e+00 }, ptr %[[COMPLEX]], align 8 + +// OGCG: %[[COMPLEX:.*]] = alloca { double, double }, align 8 +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 
0, i32 0 +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: store double 0.000000e+00, ptr %[[C_REAL_PTR]], align 8 +// OGCG: store double 2.000000e+00, ptr %[[C_IMAG_PTR]], align 8 + +void foo14() { + int _Complex c = 2i; +} + +// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<2> : !s32i> : !cir.complex + +// LLVM: %[[COMPLEX:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: store { i32, i32 } { i32 0, i32 2 }, ptr %[[COMPLEX]], align 4 + +// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0 +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4 +// OGCG: store i32 2, ptr %[[C_IMAG_PTR]], align 4 diff --git a/clang/test/CIR/CodeGen/string-literals.c b/clang/test/CIR/CodeGen/string-literals.c index 00f59b09400c8..90ea21906f363 100644 --- a/clang/test/CIR/CodeGen/string-literals.c +++ b/clang/test/CIR/CodeGen/string-literals.c @@ -5,6 +5,18 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll // RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s +char g_str[] = "1234"; + +// CIR: cir.global external @g_str = #cir.const_array<"1234\00" : !cir.array> : !cir.array + +char g_oversized[100] = "123"; + +// CIR: cir.global external @g_oversized = #cir.const_array<"123" : !cir.array, trailing_zeros> : !cir.array + +char g_exact[4] = "123"; + +// CIR: cir.global external @g_exact = #cir.const_array<"123\00" : !cir.array> : !cir.array + // CIR: cir.global "private" cir_private dsolocal @[[STR1_GLOBAL:.*]] = #cir.const_array<"1\00" : !cir.array> : !cir.array // CIR: cir.global "private" cir_private dsolocal @[[STR2_GLOBAL:.*]] = #cir.zero : !cir.array // CIR: cir.global "private" cir_private dsolocal @[[STR3_GLOBAL:.*]] = #cir.zero : 
!cir.array diff --git a/clang/test/CIR/CodeGen/string-literals.cpp b/clang/test/CIR/CodeGen/string-literals.cpp new file mode 100644 index 0000000000000..c56eb74387329 --- /dev/null +++ b/clang/test/CIR/CodeGen/string-literals.cpp @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s + +// CIR: cir.global "private" cir_private dsolocal @[[STR1_GLOBAL:.*]] = #cir.const_array<"abcd\00" : !cir.array> : !cir.array + +// LLVM: @[[STR1_GLOBAL:.*]] = private global [5 x i8] c"abcd\00" + +// OGCG: @[[STR1_GLOBAL:.*]] = private unnamed_addr constant [5 x i8] c"abcd\00" + +decltype(auto) returns_literal() { + return "abcd"; +} + +// CIR: cir.func{{.*}} @_Z15returns_literalv() -> !cir.ptr> +// CIR: %[[RET_ADDR:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["__retval"] +// CIR: %[[STR_ADDR:.*]] = cir.get_global @[[STR1_GLOBAL]] : !cir.ptr> +// CIR: cir.store{{.*}} %[[STR_ADDR]], %[[RET_ADDR]] +// CIR: %[[RET:.*]] = cir.load %[[RET_ADDR]] +// CIR: cir.return %[[RET]] diff --git a/clang/test/CIR/Transforms/vector-cmp-fold.cir b/clang/test/CIR/Transforms/vector-cmp-fold.cir new file mode 100644 index 0000000000000..b207fc08748e2 --- /dev/null +++ b/clang/test/CIR/Transforms/vector-cmp-fold.cir @@ -0,0 +1,227 @@ +// RUN: cir-opt %s -cir-canonicalize -o - -split-input-file | FileCheck %s + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i> + %vec_2 = cir.const 
#cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i> + %new_vec = cir.vec.cmp(eq, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, + // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i> + %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i> + %new_vec = cir.vec.cmp(ne, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, + // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i> + %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i> + %new_vec = cir.vec.cmp(lt, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i> + 
cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, + // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i> + %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i> + %new_vec = cir.vec.cmp(le, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, + // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i> + %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i> + %new_vec = cir.vec.cmp(gt, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, 
+ // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i> + %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i> + %new_vec = cir.vec.cmp(gt, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, + // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> + : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00> + : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %new_vec = cir.vec.cmp(eq, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, + // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> + // 
CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> + : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00> + : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %new_vec = cir.vec.cmp(ne, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, + // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> + : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00> + : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %new_vec = cir.vec.cmp(lt, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, + // CHECK-SAME: #cir.int<1> : !s32i, 
#cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> + : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00> + : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %new_vec = cir.vec.cmp(le, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i, + // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> + : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00> + : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %new_vec = cir.vec.cmp(gt, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, 
#cir.int<0> : !s32i, + // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} + +// ----- + +!s32i = !cir.int + +module { + cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> + : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00> + : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float> + %new_vec = cir.vec.cmp(ge, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i> + cir.return %new_vec : !cir.vector<4 x !s32i> + } + + // CHECK: cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> { + // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i, + // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i> + // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i> +} diff --git a/clang/test/CodeGen/aarch64-always-inline-feature-bug.c b/clang/test/CodeGen/aarch64-always-inline-feature-bug.c new file mode 100644 index 0000000000000..27c3983c66d2b --- /dev/null +++ b/clang/test/CodeGen/aarch64-always-inline-feature-bug.c @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -triple aarch64-- -target-feature +neon -target-feature +sve\ +// RUN: -target-feature -sve -emit-llvm %s -o - | FileCheck %s + +// Reproducer for bug where clang would reject always_inline for unrelated +// target features if they were disable with `-feature` on the command line. 
+// CHECK: @bar +__attribute__((always_inline)) __attribute__((target("neon"))) void foo() {} +void bar() { foo(); } diff --git a/clang/test/CodeGen/epilog-unwind.c b/clang/test/CodeGen/epilog-unwind.c index 991ff09fb37cf..b2f7497b455b6 100644 --- a/clang/test/CodeGen/epilog-unwind.c +++ b/clang/test/CodeGen/epilog-unwind.c @@ -1,9 +1,11 @@ // RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s -check-prefix=DISABLED -// RUN: %clang_cc1 -fwinx64-eh-unwindv2 -emit-llvm %s -o - | FileCheck %s -check-prefix=ENABLED -// RUN: %clang -fwinx64-eh-unwindv2 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=ENABLED -// RUN: %clang -fno-winx64-eh-unwindv2 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=DISABLED +// RUN: %clang_cc1 -fwinx64-eh-unwindv2=disabled -emit-llvm %s -o - | FileCheck %s -check-prefix=DISABLED +// RUN: %clang_cc1 -fwinx64-eh-unwindv2=best-effort -emit-llvm %s -o - | FileCheck %s -check-prefix=BESTEFFORT +// RUN: %clang_cc1 -fwinx64-eh-unwindv2=required -emit-llvm %s -o - | FileCheck %s -check-prefix=REQUIRED +// RUN: %clang -fwinx64-eh-unwindv2=best-effort -S -emit-llvm %s -o - | FileCheck %s -check-prefix=BESTEFFORT void f(void) {} -// ENABLED: !"winx64-eh-unwindv2", i32 1} +// BESTEFFORT: !"winx64-eh-unwindv2", i32 1} +// REQUIRED: !"winx64-eh-unwindv2", i32 2} // DISABLED-NOT: "winx64-eh-unwindv2" diff --git a/clang/test/CodeGenHLSL/ArrayAssignable.hlsl b/clang/test/CodeGenHLSL/ArrayAssignable.hlsl index c3204570d6ef3..aaa486eff10b7 100644 --- a/clang/test/CodeGenHLSL/ArrayAssignable.hlsl +++ b/clang/test/CodeGenHLSL/ArrayAssignable.hlsl @@ -7,10 +7,10 @@ struct S { // CHECK: [[CBLayout:%.*]] = type <{ [2 x float], [2 x <4 x i32>], [2 x [2 x i32]], [1 x target("dx.Layout", %S, 8, 0, 4)] }> // CHECK: @CBArrays.cb = global target("dx.CBuffer", target("dx.Layout", [[CBLayout]], 136, 0, 32, 64, 128)) -// CHECK: @c1 = external addrspace(2) global [2 x float], align 4 -// CHECK: @c2 = external addrspace(2) global [2 x <4 x i32>], align 16 -// CHECK: @c3 
= external addrspace(2) global [2 x [2 x i32]], align 4 -// CHECK: @c4 = external addrspace(2) global [1 x target("dx.Layout", %S, 8, 0, 4)], align 1 +// CHECK: @c1 = external hidden addrspace(2) global [2 x float], align 4 +// CHECK: @c2 = external hidden addrspace(2) global [2 x <4 x i32>], align 16 +// CHECK: @c3 = external hidden addrspace(2) global [2 x [2 x i32]], align 4 +// CHECK: @c4 = external hidden addrspace(2) global [1 x target("dx.Layout", %S, 8, 0, 4)], align 1 cbuffer CBArrays : register(b0) { float c1[2]; @@ -19,7 +19,7 @@ cbuffer CBArrays : register(b0) { S c4[1]; } -// CHECK-LABEL: define void {{.*}}arr_assign1 +// CHECK-LABEL: define hidden void {{.*}}arr_assign1 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4 // CHECK-NOT: alloca @@ -33,7 +33,7 @@ void arr_assign1() { Arr = Arr2; } -// CHECK-LABEL: define void {{.*}}arr_assign2 +// CHECK-LABEL: define hidden void {{.*}}arr_assign2 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: [[Arr3:%.*]] = alloca [2 x i32], align 4 @@ -51,7 +51,7 @@ void arr_assign2() { Arr = Arr2 = Arr3; } -// CHECK-LABEL: define void {{.*}}arr_assign3 +// CHECK-LABEL: define hidden void {{.*}}arr_assign3 // CHECK: [[Arr3:%.*]] = alloca [2 x [2 x i32]], align 4 // CHECK-NEXT: [[Arr4:%.*]] = alloca [2 x [2 x i32]], align 4 // CHECK-NOT: alloca @@ -65,7 +65,7 @@ void arr_assign3() { Arr2 = Arr3; } -// CHECK-LABEL: define void {{.*}}arr_assign4 +// CHECK-LABEL: define hidden void {{.*}}arr_assign4 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4 // CHECK-NOT: alloca @@ -81,7 +81,7 @@ void arr_assign4() { (Arr = Arr2)[0] = 6; } -// CHECK-LABEL: define void {{.*}}arr_assign5 +// CHECK-LABEL: define hidden void {{.*}}arr_assign5 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4 // 
CHECK-NEXT: [[Arr3:%.*]] = alloca [2 x i32], align 4 @@ -101,7 +101,7 @@ void arr_assign5() { (Arr = Arr2 = Arr3)[0] = 6; } -// CHECK-LABEL: define void {{.*}}arr_assign6 +// CHECK-LABEL: define hidden void {{.*}}arr_assign6 // CHECK: [[Arr3:%.*]] = alloca [2 x [2 x i32]], align 4 // CHECK-NEXT: [[Arr4:%.*]] = alloca [2 x [2 x i32]], align 4 // CHECK-NOT: alloca @@ -118,7 +118,7 @@ void arr_assign6() { (Arr = Arr2)[0][0] = 6; } -// CHECK-LABEL: define void {{.*}}arr_assign7 +// CHECK-LABEL: define hidden void {{.*}}arr_assign7 // CHECK: [[Arr:%.*]] = alloca [2 x [2 x i32]], align 4 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x [2 x i32]], align 4 // CHECK-NOT: alloca @@ -138,7 +138,7 @@ void arr_assign7() { // Verify you can assign from a cbuffer array -// CHECK-LABEL: define void {{.*}}arr_assign8 +// CHECK-LABEL: define hidden void {{.*}}arr_assign8 // CHECK: [[C:%.*]] = alloca [2 x float], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[C]], ptr align 4 {{.*}}, i32 8, i1 false) // CHECK-NEXT: call void @llvm.memcpy.p0.p2.i32(ptr align 4 [[C]], ptr addrspace(2) align 4 @c1, i32 8, i1 false) @@ -148,7 +148,7 @@ void arr_assign8() { C = c1; } -// CHECK-LABEL: define void {{.*}}arr_assign9 +// CHECK-LABEL: define hidden void {{.*}}arr_assign9 // CHECK: [[C:%.*]] = alloca [2 x <4 x i32>], align 16 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[C]], ptr align 16 {{.*}}, i32 32, i1 false) // CHECK-NEXT: call void @llvm.memcpy.p0.p2.i32(ptr align 16 [[C]], ptr addrspace(2) align 16 @c2, i32 32, i1 false) @@ -158,7 +158,7 @@ void arr_assign9() { C = c2; } -// CHECK-LABEL: define void {{.*}}arr_assign10 +// CHECK-LABEL: define hidden void {{.*}}arr_assign10 // CHECK: [[C:%.*]] = alloca [2 x [2 x i32]], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[C]], ptr align 4 {{.*}}, i32 16, i1 false) // CHECK-NEXT: call void @llvm.memcpy.p0.p2.i32(ptr align 4 [[C]], ptr addrspace(2) align 4 @c3, i32 16, i1 false) @@ -168,7 
+168,7 @@ void arr_assign10() { C = c3; } -// CHECK-LABEL: define void {{.*}}arr_assign11 +// CHECK-LABEL: define hidden void {{.*}}arr_assign11 // CHECK: [[C:%.*]] = alloca [1 x %struct.S], align 1 // CHECK: call void @llvm.memcpy.p0.p2.i32(ptr align 1 [[C]], ptr addrspace(2) align 1 @c4, i32 8, i1 false) // CHECK-NEXT: ret void diff --git a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl index 29ea896045bb1..42a469ae87957 100644 --- a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl +++ b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl @@ -3,7 +3,7 @@ void fn(float x[2]) { } -// CHECK-LABEL: define void {{.*}}call{{.*}} +// CHECK-LABEL: define hidden void {{.*}}call{{.*}} // CHECK: [[Arr:%.*]] = alloca [2 x float] // CHECK: [[Tmp:%.*]] = alloca [2 x float] // CHECK: call void @llvm.memset.p0.i32(ptr align 4 [[Arr]], i8 0, i32 8, i1 false) @@ -21,7 +21,7 @@ struct Obj { void fn2(Obj O[4]) { } -// CHECK-LABEL: define void {{.*}}call2{{.*}} +// CHECK-LABEL: define hidden void {{.*}}call2{{.*}} // CHECK: [[Arr:%.*]] = alloca [4 x %struct.Obj] // CHECK: [[Tmp:%.*]] = alloca [4 x %struct.Obj] // CHECK: call void @llvm.memset.p0.i32(ptr align 1 [[Arr]], i8 0, i32 32, i1 false) @@ -35,7 +35,7 @@ void call2() { void fn3(float x[2][2]) { } -// CHECK-LABEL: define void {{.*}}call3{{.*}} +// CHECK-LABEL: define hidden void {{.*}}call3{{.*}} // CHECK: [[Arr:%.*]] = alloca [2 x [2 x float]] // CHECK: [[Tmp:%.*]] = alloca [2 x [2 x float]] // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Arr]], ptr align 4 {{.*}}, i32 16, i1 false) @@ -46,7 +46,7 @@ void call3() { fn3(Arr); } -// CHECK-LABEL: define void {{.*}}call4{{.*}}(ptr +// CHECK-LABEL: define hidden void {{.*}}call4{{.*}}(ptr // CHECK-SAME: noundef byval([2 x [2 x float]]) align 4 [[Arr:%.*]]) // CHECK: [[Tmp:%.*]] = alloca [2 x [2 x float]] // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[Arr]], i32 16, i1 false) @@ -59,7 +59,7 @@ void call4(float 
Arr[2][2]) { // Verify that each template instantiation codegens to a unique and correctly // mangled function name. -// CHECK-LABEL: define void {{.*}}template_call{{.*}}(ptr +// CHECK-LABEL: define hidden void {{.*}}template_call{{.*}}(ptr // CHECK-SAME: noundef byval([2 x float]) align 4 [[FA2:%[0-9A-Z]+]], // CHECK-SAME: ptr noundef byval([4 x float]) align 4 [[FA4:%[0-9A-Z]+]], @@ -86,7 +86,7 @@ void template_call(float FA2[2], float FA4[4], int IA3[3]) { // Verify that Array parameter element access correctly codegens. -// CHECK-LABEL: define void {{.*}}element_access{{.*}}(ptr +// CHECK-LABEL: define hidden void {{.*}}element_access{{.*}}(ptr // CHECK-SAME: noundef byval([2 x float]) align 4 [[FA2:%[0-9A-Z]+]] // CHECK: [[Addr:%.*]] = getelementptr inbounds [2 x float], ptr [[FA2]], i32 0, i32 0 diff --git a/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl index eb7d755bca61d..bccfaf597f0ed 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl @@ -11,7 +11,7 @@ void increment(inout int Arr[2]) { // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false) -// CHECK-NEXT: call void @{{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3 +// CHECK-NEXT: call void @{{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false) // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4 @@ -32,7 +32,7 @@ void fn2(out int Arr[2]) { // CHECK: [[A:%.*]] = alloca [2 x i32], 
align 4 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false) -// CHECK-NEXT: call void @{{.*}}fn2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3 +// CHECK-NEXT: call void @{{.*}}fn2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false) // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4 @@ -56,7 +56,7 @@ void nestedCall(inout int Arr[2], uint index) { // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false) -// CHECK-NEXT: call void @{{.*}}nestedCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]], i32 noundef 0) #3 +// CHECK-NEXT: call void @{{.*}}nestedCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]], i32 noundef 0) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false) // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 1 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4 @@ -70,7 +70,7 @@ export int arrayCall3() { // CHECK-LABEL: outerCall // CHECK: [[Tmp:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 %{{.*}}, i32 8, i1 false) -// CHECK-NEXT: call void {{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3 +// CHECK-NEXT: call void {{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 {{.*}}, ptr align 4 [[Tmp]], i32 8, i1 false) // CHECK-NEXT: ret 
void void outerCall(inout int Arr[2]) { @@ -82,7 +82,7 @@ void outerCall(inout int Arr[2]) { // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false) -// CHECK-NEXT: call void @{{.*}}outerCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3 +// CHECK-NEXT: call void @{{.*}}outerCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false) // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4 @@ -99,7 +99,7 @@ void fn3(int Arr[2]) {} // CHECK-LABEL: outerCall2 // CHECK: [[Tmp:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 {{.*}}, i32 8, i1 false) -// CHECK-NEXT: call void {{.*}}fn3{{.*}}(ptr noundef byval([2 x i32]) align 4 [[Tmp]]) #3 +// CHECK-NEXT: call void {{.*}}fn3{{.*}}(ptr noundef byval([2 x i32]) align 4 [[Tmp]]) // CHECK-NEXT: ret void void outerCall2(inout int Arr[2]) { fn3(Arr); @@ -110,7 +110,7 @@ void outerCall2(inout int Arr[2]) { // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false) -// CHECK-NEXT: call void @{{.*}}outerCall2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3 +// CHECK-NEXT: call void @{{.*}}outerCall2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false) // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr 
[[A]], i32 0, i32 0 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4 diff --git a/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl index 371f31c9e4afc..c30c640519cda 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl @@ -46,7 +46,7 @@ struct SlicyBits { }; // Case 1: Extraneous braces get ignored in literal instantiation. -// CHECK-LABEL: define void @_Z5case1v( +// CHECK-LABEL: define hidden void @_Z5case1v( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[AGG_RESULT]], ptr align 1 @__const._Z5case1v.TF1, i32 8, i1 false) @@ -58,7 +58,7 @@ TwoFloats case1() { } // Case 2: Valid C/C++ initializer is handled appropriately. -// CHECK-LABEL: define void @_Z5case2v( +// CHECK-LABEL: define hidden void @_Z5case2v( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[AGG_RESULT]], ptr align 1 @__const._Z5case2v.TF2, i32 8, i1 false) @@ -70,7 +70,7 @@ TwoFloats case2() { } // Case 3: Simple initialization with conversion of an argument. -// CHECK-LABEL: define void @_Z5case3i( +// CHECK-LABEL: define hidden void @_Z5case3i( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], i32 noundef [[VAL:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VAL_ADDR:%.*]] = alloca i32, align 4 @@ -90,7 +90,7 @@ TwoFloats case3(int Val) { // Case 4: Initialization from a scalarized vector into a structure with element // conversions. 
-// CHECK-LABEL: define void @_Z5case4Dv2_i( +// CHECK-LABEL: define hidden void @_Z5case4Dv2_i( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], <2 x i32> noundef [[TWOVALS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TWOVALS_ADDR:%.*]] = alloca <2 x i32>, align 8 @@ -113,7 +113,7 @@ TwoFloats case4(int2 TwoVals) { } // Case 5: Initialization from a scalarized vector of matching type. -// CHECK-LABEL: define void @_Z5case5Dv2_i( +// CHECK-LABEL: define hidden void @_Z5case5Dv2_i( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 1 [[AGG_RESULT:%.*]], <2 x i32> noundef [[TWOVALS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TWOVALS_ADDR:%.*]] = alloca <2 x i32>, align 8 @@ -135,7 +135,7 @@ TwoInts case5(int2 TwoVals) { // Case 6: Initialization from a scalarized structure of different type with // different element types. -// CHECK-LABEL: define void @_Z5case69TwoFloats( +// CHECK-LABEL: define hidden void @_Z5case69TwoFloats( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 1 [[TF4:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 0 @@ -157,7 +157,7 @@ TwoInts case6(TwoFloats TF4) { // Case 7: Initialization of a complex structure, with bogus braces and element // conversions from a collection of scalar values, and structures. 
-// CHECK-LABEL: define void @_Z5case77TwoIntsS_i9TwoFloatsS0_S0_S0_( +// CHECK-LABEL: define hidden void @_Z5case77TwoIntsS_i9TwoFloatsS0_S0_S0_( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_DOGGO:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOINTS:%.*]]) align 1 [[TI1:%.*]], ptr noundef byval([[STRUCT_TWOINTS]]) align 1 [[TI2:%.*]], i32 noundef [[VAL:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 1 [[TF1:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF2:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF3:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF4:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VAL_ADDR:%.*]] = alloca i32, align 4 @@ -221,7 +221,7 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // Case 8: Initialization of a structure from a different structure with // significantly different element types and grouping. -// CHECK-LABEL: define void @_Z5case85Doggo( +// CHECK-LABEL: define hidden void @_Z5case85Doggo( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_ANIMALBITS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_DOGGO:%.*]]) align 1 [[D1:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[LEGS:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[AGG_RESULT]], i32 0, i32 0 @@ -307,7 +307,7 @@ AnimalBits case8(Doggo D1) { // Case 9: Everything everywhere all at once... Initializing mismatched // structures from different layouts, different component groupings, with no // top-level bracing separation. 
-// CHECK-LABEL: define void @_Z5case95Doggo10AnimalBits( +// CHECK-LABEL: define hidden void @_Z5case95Doggo10AnimalBits( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_ZOO:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_DOGGO:%.*]]) align 1 [[D1:%.*]], ptr noundef byval([[STRUCT_ANIMALBITS:%.*]]) align 1 [[A1:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOGS:%.*]] = getelementptr inbounds nuw [[STRUCT_ZOO]], ptr [[AGG_RESULT]], i32 0, i32 0 @@ -723,7 +723,7 @@ Zoo case9(Doggo D1, AnimalBits A1) { } // Case 10: Initialize an object with a base class from two objects. -// CHECK-LABEL: define void @_Z6case109TwoFloatsS_( +// CHECK-LABEL: define hidden void @_Z6case109TwoFloatsS_( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 1 [[TF1:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF2:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0 @@ -750,7 +750,7 @@ FourFloats case10(TwoFloats TF1, TwoFloats TF2) { } // Case 11: Initialize an object with a base class from a vector splat. -// CHECK-LABEL: define void @_Z6case11f( +// CHECK-LABEL: define hidden void @_Z6case11f( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], float noundef nofpclass(nan inf) [[F:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[F_ADDR:%.*]] = alloca float, align 4 @@ -799,7 +799,7 @@ FourFloats case11(float F) { } // Case 12: Initialize bitfield from two integers. 
-// CHECK-LABEL: define void @_Z6case12ii( +// CHECK-LABEL: define hidden void @_Z6case12ii( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_SLICYBITS:%.*]]) align 1 [[AGG_RESULT:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -821,7 +821,7 @@ SlicyBits case12(int I, int J) { } // Case 13: Initialize bitfield from a struct of two ints. -// CHECK-LABEL: define void @_Z6case137TwoInts( +// CHECK-LABEL: define hidden void @_Z6case137TwoInts( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_SLICYBITS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOINTS:%.*]]) align 1 [[TI:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[TI]], i32 0, i32 0 @@ -841,7 +841,7 @@ SlicyBits case13(TwoInts TI) { } // Case 14: Initialize struct of ints from struct with bitfields. -// CHECK-LABEL: define void @_Z6case149SlicyBits( +// CHECK-LABEL: define hidden void @_Z6case149SlicyBits( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_SLICYBITS:%.*]]) align 1 [[SB:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 0 @@ -861,7 +861,7 @@ TwoInts case14(SlicyBits SB) { } // Case 15: Initialize struct of floats from struct with bitfields. 
-// CHECK-LABEL: define void @_Z6case159SlicyBits( +// CHECK-LABEL: define hidden void @_Z6case159SlicyBits( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_SLICYBITS:%.*]]) align 1 [[SB:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0 @@ -884,7 +884,7 @@ TwoFloats case15(SlicyBits SB) { // Case 16: Side-effecting initialization list arguments. The important thing // here is that case16 only has _one_ call to makeTwo. -// CHECK-LABEL: define void @_Z7makeTwoRf( +// CHECK-LABEL: define hidden void @_Z7makeTwoRf( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noalias noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 4 @@ -910,7 +910,7 @@ TwoFloats makeTwo(inout float X) { return TF; } -// CHECK-LABEL: define void @_Z6case16v( +// CHECK-LABEL: define hidden void @_Z6case16v( // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X:%.*]] = alloca float, align 4 @@ -948,7 +948,7 @@ int case17Helper(int x) { } // InitList with OpaqueValueExpr -// CHECK-LABEL: define void {{.*}}case17 +// CHECK-LABEL: define hidden void {{.*}}case17 // CHECK: [[X:%.*]] = alloca <2 x i32>, align 8 // CHECK-NEXT: [[C:%.*]] = call noundef i32 {{.*}}case17Helper{{.*}}(i32 noundef 0) // CHECK-NEXT: [[C1:%.*]] = call noundef i32 {{.*}}case17Helper{{.*}}(i32 noundef 1) diff --git a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl index 1f45a7f9b46d3..d0ba8f447b732 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl +++ 
b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl @@ -6,7 +6,7 @@ // integer. It is converted to an integer on call and converted back after the // function. -// CHECK: define void {{.*}}trunc_Param{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}}) +// CHECK: define hidden void {{.*}}trunc_Param{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}}) void trunc_Param(inout int X) {} // ALL-LABEL: define noundef nofpclass(nan inf) float {{.*}}case1 @@ -32,7 +32,7 @@ export float case1(float F) { // uninitialized in the function. If they are not initialized before the // function returns the value is undefined. -// CHECK: define void {{.*}}undef{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}}) +// CHECK: define hidden void {{.*}}undef{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}}) void undef(out int Z) { } // ALL-LABEL: define noundef i32 {{.*}}case2 @@ -54,7 +54,7 @@ export int case2() { // This test should verify that an out parameter value is written to as // expected. -// CHECK: define void {{.*}}zero{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}}) +// CHECK: define hidden void {{.*}}zero{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}}) void zero(out int Z) { Z = 0; } // ALL-LABEL: define noundef i32 {{.*}}case3 @@ -76,7 +76,7 @@ export int case3() { // Vector swizzles in HLSL produce lvalues, so they can be used as arguments to // inout parameters and the swizzle is reversed on writeback. -// CHECK: define void {{.*}}funky{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}}) +// CHECK: define hidden void {{.*}}funky{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}}) void funky(inout int3 X) { X.x += 1; X.y += 2; @@ -116,7 +116,7 @@ export int3 case4() { // Case 5: Straightforward inout of a scalar value. 
-// CHECK: define void {{.*}}increment{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}}) +// CHECK: define hidden void {{.*}}increment{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}}) void increment(inout int I) { I += 1; } @@ -144,7 +144,7 @@ struct S { float Y; }; -// CHECK: define void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}}) +// CHECK: define hidden void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}}) void init(out S s) { s.X = 3; s.Y = 4; @@ -170,7 +170,7 @@ struct R { float Y; }; -// CHECK: define void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}}) +// CHECK: define hidden void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}}) void init(inout R s) { s.X = 3; s.Y = 4; @@ -194,7 +194,7 @@ export int case7() { // Case 8: Non-scalars with a cast expression. -// CHECK: define void {{.*}}trunc_vec{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}}) +// CHECK: define hidden void {{.*}}trunc_vec{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}}) void trunc_vec(inout int3 V) {} // ALL-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}case8 diff --git a/clang/test/CodeGenHLSL/Bool.hlsl b/clang/test/CodeGenHLSL/Bool.hlsl index fb0f32b11241d..21328c1f9d4df 100644 --- a/clang/test/CodeGenHLSL/Bool.hlsl +++ b/clang/test/CodeGenHLSL/Bool.hlsl @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -// CHECK-LABEL: define noundef i1 {{.*}}fn{{.*}}(i1 noundef %x) +// CHECK-LABEL: define hidden noundef i1 {{.*}}fn{{.*}}(i1 noundef %x) // CHECK: [[X:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[Y:%.*]] = zext i1 {{%.*}} to i32 // CHECK-NEXT: store i32 [[Y]], ptr [[X]], align 4 diff --git a/clang/test/CodeGenHLSL/BoolVector.hlsl 
b/clang/test/CodeGenHLSL/BoolVector.hlsl index 35d8b9dac801d..d5054a5a92b5d 100644 --- a/clang/test/CodeGenHLSL/BoolVector.hlsl +++ b/clang/test/CodeGenHLSL/BoolVector.hlsl @@ -9,7 +9,7 @@ struct S { float f; }; -// CHECK-LABEL: define noundef i1 {{.*}}fn1{{.*}} +// CHECK-LABEL: define hidden noundef i1 {{.*}}fn1{{.*}} // CHECK: [[B:%.*]] = alloca <2 x i32>, align 8 // CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[B]], align 8 // CHECK-NEXT: [[BoolVec:%.*]] = load <2 x i32>, ptr [[B]], align 8 @@ -21,7 +21,7 @@ bool fn1() { return B[0]; } -// CHECK-LABEL: define noundef <2 x i1> {{.*}}fn2{{.*}} +// CHECK-LABEL: define hidden noundef <2 x i1> {{.*}}fn2{{.*}} // CHECK: [[VAddr:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[A:%.*]] = alloca <2 x i32>, align 8 // CHECK-NEXT: [[StoreV:%.*]] = zext i1 {{.*}} to i32 @@ -40,7 +40,7 @@ bool2 fn2(bool V) { return A; } -// CHECK-LABEL: define noundef i1 {{.*}}fn3{{.*}} +// CHECK-LABEL: define hidden noundef i1 {{.*}}fn3{{.*}} // CHECK: [[s:%.*]] = alloca %struct.S, align 1 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[s]], ptr align 1 [[ConstS]], i32 12, i1 false) // CHECK-NEXT: [[BV:%.*]] = getelementptr inbounds nuw %struct.S, ptr [[s]], i32 0, i32 0 @@ -53,7 +53,7 @@ bool fn3() { return s.bv[0]; } -// CHECK-LABEL: define noundef i1 {{.*}}fn4{{.*}} +// CHECK-LABEL: define hidden noundef i1 {{.*}}fn4{{.*}} // CHECK: [[Arr:%.*]] = alloca [2 x <2 x i32>], align 8 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[Arr]], ptr align 8 [[ConstArr]], i32 16, i1 false) // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[Arr]], i32 0, i32 0 @@ -66,7 +66,7 @@ bool fn4() { return Arr[0][1]; } -// CHECK-LABEL: define void {{.*}}fn5{{.*}} +// CHECK-LABEL: define hidden void {{.*}}fn5{{.*}} // CHECK: [[Arr:%.*]] = alloca <2 x i32>, align 8 // CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[Arr]], align 8 // CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[Arr]], align 8 @@ -78,7 +78,7 @@ 
void fn5() { Arr[1] = false; } -// CHECK-LABEL: define void {{.*}}fn6{{.*}} +// CHECK-LABEL: define hidden void {{.*}}fn6{{.*}} // CHECK: [[V:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[S:%.*]] = alloca %struct.S, align 1 // CHECK-NEXT: store i32 0, ptr [[V]], align 4 @@ -97,7 +97,7 @@ void fn6() { s.bv[1] = V; } -// CHECK-LABEL: define void {{.*}}fn7{{.*}} +// CHECK-LABEL: define hidden void {{.*}}fn7{{.*}} // CHECK: [[Arr:%.*]] = alloca [2 x <2 x i32>], align 8 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[Arr]], ptr align 8 {{.*}}, i32 16, i1 false) // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[Arr]], i32 0, i32 0 diff --git a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl index 9090e9e85ed98..afda714106fac 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl @@ -33,7 +33,7 @@ void SecondEntry() {} // Verify the constructor is alwaysinline // NOINLINE: ; Function Attrs: {{.*}}alwaysinline -// NOINLINE-NEXT: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2EjijjPKc({{.*}} [[CtorAttr:\#[0-9]+]] +// NOINLINE-NEXT: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC2EjijjPKc({{.*}} [[CtorAttr:\#[0-9]+]] // NOINLINE: ; Function Attrs: {{.*}}alwaysinline // NOINLINE-NEXT: define internal void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() [[InitAttr:\#[0-9]+]] diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl index 362042654ea8c..37fb5195e9768 100644 --- a/clang/test/CodeGenHLSL/basic_types.hlsl +++ b/clang/test/CodeGenHLSL/basic_types.hlsl @@ -6,38 +6,38 @@ // RUN: -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s -// CHECK: @uint16_t_Val = external addrspace(2) global i16, align 2 -// CHECK: @int16_t_Val = external addrspace(2) global i16, align 2 -// CHECK: @uint_Val = external addrspace(2) global i32, align 4 -// CHECK: @uint64_t_Val = external 
addrspace(2) global i64, align 8 -// CHECK: @int64_t_Val = external addrspace(2) global i64, align 8 -// CHECK: @int16_t2_Val = external addrspace(2) global <2 x i16>, align 4 -// CHECK: @int16_t3_Val = external addrspace(2) global <3 x i16>, align 8 -// CHECK: @int16_t4_Val = external addrspace(2) global <4 x i16>, align 8 -// CHECK: @uint16_t2_Val = external addrspace(2) global <2 x i16>, align 4 -// CHECK: @uint16_t3_Val = external addrspace(2) global <3 x i16>, align 8 -// CHECK: @uint16_t4_Val = external addrspace(2) global <4 x i16>, align 8 -// CHECK: @int2_Val = external addrspace(2) global <2 x i32>, align 8 -// CHECK: @int3_Val = external addrspace(2) global <3 x i32>, align 16 -// CHECK: @int4_Val = external addrspace(2) global <4 x i32>, align 16 -// CHECK: @uint2_Val = external addrspace(2) global <2 x i32>, align 8 -// CHECK: @uint3_Val = external addrspace(2) global <3 x i32>, align 16 -// CHECK: @uint4_Val = external addrspace(2) global <4 x i32>, align 16 -// CHECK: @int64_t2_Val = external addrspace(2) global <2 x i64>, align 16 -// CHECK: @int64_t3_Val = external addrspace(2) global <3 x i64>, align 32 -// CHECK: @int64_t4_Val = external addrspace(2) global <4 x i64>, align 32 -// CHECK: @uint64_t2_Val = external addrspace(2) global <2 x i64>, align 16 -// CHECK: @uint64_t3_Val = external addrspace(2) global <3 x i64>, align 32 -// CHECK: @uint64_t4_Val = external addrspace(2) global <4 x i64>, align 32 -// CHECK: @half2_Val = external addrspace(2) global <2 x half>, align 4 -// CHECK: @half3_Val = external addrspace(2) global <3 x half>, align 8 -// CHECK: @half4_Val = external addrspace(2) global <4 x half>, align 8 -// CHECK: @float2_Val = external addrspace(2) global <2 x float>, align 8 -// CHECK: @float3_Val = external addrspace(2) global <3 x float>, align 16 -// CHECK: @float4_Val = external addrspace(2) global <4 x float>, align 16 -// CHECK: @double2_Val = external addrspace(2) global <2 x double>, align 16 -// CHECK: @double3_Val = 
external addrspace(2) global <3 x double>, align 32 -// CHECK: @double4_Val = external addrspace(2) global <4 x double>, align 32 +// CHECK: @uint16_t_Val = external hidden addrspace(2) global i16, align 2 +// CHECK: @int16_t_Val = external hidden addrspace(2) global i16, align 2 +// CHECK: @uint_Val = external hidden addrspace(2) global i32, align 4 +// CHECK: @uint64_t_Val = external hidden addrspace(2) global i64, align 8 +// CHECK: @int64_t_Val = external hidden addrspace(2) global i64, align 8 +// CHECK: @int16_t2_Val = external hidden addrspace(2) global <2 x i16>, align 4 +// CHECK: @int16_t3_Val = external hidden addrspace(2) global <3 x i16>, align 8 +// CHECK: @int16_t4_Val = external hidden addrspace(2) global <4 x i16>, align 8 +// CHECK: @uint16_t2_Val = external hidden addrspace(2) global <2 x i16>, align 4 +// CHECK: @uint16_t3_Val = external hidden addrspace(2) global <3 x i16>, align 8 +// CHECK: @uint16_t4_Val = external hidden addrspace(2) global <4 x i16>, align 8 +// CHECK: @int2_Val = external hidden addrspace(2) global <2 x i32>, align 8 +// CHECK: @int3_Val = external hidden addrspace(2) global <3 x i32>, align 16 +// CHECK: @int4_Val = external hidden addrspace(2) global <4 x i32>, align 16 +// CHECK: @uint2_Val = external hidden addrspace(2) global <2 x i32>, align 8 +// CHECK: @uint3_Val = external hidden addrspace(2) global <3 x i32>, align 16 +// CHECK: @uint4_Val = external hidden addrspace(2) global <4 x i32>, align 16 +// CHECK: @int64_t2_Val = external hidden addrspace(2) global <2 x i64>, align 16 +// CHECK: @int64_t3_Val = external hidden addrspace(2) global <3 x i64>, align 32 +// CHECK: @int64_t4_Val = external hidden addrspace(2) global <4 x i64>, align 32 +// CHECK: @uint64_t2_Val = external hidden addrspace(2) global <2 x i64>, align 16 +// CHECK: @uint64_t3_Val = external hidden addrspace(2) global <3 x i64>, align 32 +// CHECK: @uint64_t4_Val = external hidden addrspace(2) global <4 x i64>, align 32 +// CHECK: @half2_Val = 
external hidden addrspace(2) global <2 x half>, align 4 +// CHECK: @half3_Val = external hidden addrspace(2) global <3 x half>, align 8 +// CHECK: @half4_Val = external hidden addrspace(2) global <4 x half>, align 8 +// CHECK: @float2_Val = external hidden addrspace(2) global <2 x float>, align 8 +// CHECK: @float3_Val = external hidden addrspace(2) global <3 x float>, align 16 +// CHECK: @float4_Val = external hidden addrspace(2) global <4 x float>, align 16 +// CHECK: @double2_Val = external hidden addrspace(2) global <2 x double>, align 16 +// CHECK: @double3_Val = external hidden addrspace(2) global <3 x double>, align 32 +// CHECK: @double4_Val = external hidden addrspace(2) global <4 x double>, align 32 #ifdef NAMESPACED #define TYPE_DECL(T) hlsl::T T##_Val diff --git a/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl b/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl index e1832bdbbf33f..8457ad6da293f 100644 --- a/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl +++ b/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl @@ -4,7 +4,7 @@ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef <2 x i32> @_Z20test_AddUint64_uint2Dv2_jS_( +// CHECK-LABEL: define hidden noundef <2 x i32> @_Z20test_AddUint64_uint2Dv2_jS_( // CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i32>, align 8 @@ -31,7 +31,7 @@ uint2 test_AddUint64_uint2(uint2 a, uint2 b) { return AddUint64(a, b); } -// CHECK-LABEL: define noundef <4 x i32> @_Z20test_AddUint64_uint4Dv4_jS_( +// CHECK-LABEL: define hidden noundef <4 x i32> @_Z20test_AddUint64_uint4Dv4_jS_( // CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i32>, align 16 diff --git a/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl 
b/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl index 403d473ce9680..3a8d2c03e173c 100644 --- a/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl @@ -35,7 +35,7 @@ export void foo() { // CHECK-SAME: i32 noundef 1, i32 noundef 2, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]]) // Buf1 initialization part 2 - body of ByteAddressBuffer C1 constructor with explicit binding that calls the C2 constructor -// CHECK: define linkonce_odr void @_ZN4hlsl17ByteAddressBufferC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl17ByteAddressBufferC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) // CHECK: call void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) @@ -47,27 +47,27 @@ export void foo() { // CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]]) // Buf2 initialization part 2 - body of RWByteAddressBuffer C1 constructor with implicit binding that calls the C2 constructor -// CHECK: define linkonce_odr void @_ZN4hlsl19RWByteAddressBufferC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl19RWByteAddressBufferC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) // CHECK: call void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this1, // CHECK-SAME: i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, 
i32 noundef %{{.*}}, ptr noundef %{{.*}}) // Buf3 initialization part 1 - local variable declared in function foo() is initialized by // RasterizerOrderedByteAddressBuffer C1 default constructor -// CHECK: define void @_Z3foov() #2 { +// CHECK: define void @_Z3foov() // CHECK-NEXT: entry: // CHECK-NEXT: %Buf3 = alloca %"class.hlsl::RasterizerOrderedByteAddressBuffer", align 4 // CHECK-NEXT: call void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %Buf3) // Buf3 initialization part 2 - body of RasterizerOrderedByteAddressBuffer default C1 constructor that // calls the default C2 constructor -// CHECK: define linkonce_odr void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK: define linkonce_odr hidden void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) // CHECK: call void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}}) // CHECK-NEXT: ret void // Buf1 initialization part 3 - ByteAddressBuffer C2 constructor with explicit binding that initializes // handle with @llvm.dx.resource.handlefrombinding -// CHECK: define linkonce_odr void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) // CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t( // CHECK-DXIL-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}}) @@ -76,7 +76,7 @@ export void foo() { // Buf2 initialization part 3 - body of RWByteAddressBuffer C2 constructor with implicit binding that 
initializes // handle with @llvm.dx.resource.handlefromimplicitbinding -// CHECK: define linkonce_odr void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) // CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", i8, 1, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_i8_1_0t // CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}}) @@ -85,7 +85,7 @@ export void foo() { // Buf3 initialization part 3 - body of RasterizerOrderedByteAddressBuffer default C2 constructor that // initializes handle to poison -// CHECK: define linkonce_odr void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK: define linkonce_odr hidden void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedByteAddressBuffer", ptr %{{.*}}, i32 0, i32 0 // CHECK: store target("dx.RawBuffer", i8, 1, 1) poison, ptr %__handle, align 4 diff --git a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl index 9d95d54852c0b..114230d38ba54 100644 --- a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl +++ b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl @@ -1,14 +1,14 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: -DTARGET=dx -DFNATTRS=noundef -check-prefixes=CHECK,CHECK-DXIL +// RUN: -DTARGET=dx 
-check-prefixes=CHECK,CHECK-DXIL // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef" -check-prefixes=CHECK,CHECK-SPIRV +// RUN: -DTARGET=spv -check-prefixes=CHECK,CHECK-SPIRV -// CHECK-DXIL: define void @ -// CHECK-SPIRV: define spir_func void @ +// CHECK-DXIL: define hidden void @ +// CHECK-SPIRV: define hidden spir_func void @ void test_GroupMemoryBarrierWithGroupSync() { // CHECK-DXIL: call void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() // CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl index e74a7ed270b01..114468914e2ea 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl @@ -35,7 +35,7 @@ export void foo() { // CHECK-SAME: i32 noundef 5, i32 noundef 3, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]]) // Buf1 initialization part 2 - body of RWBuffer C1 constructor with explicit binding that calls the C2 constructor -// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) // CHECK: call void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) @@ -47,7 +47,7 @@ export void foo() { // CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef 
@[[Buf2Str]]) // Buf2 initialization part 2 - body of RWBuffer C1 constructor with implicit binding that calls the C2 constructor -// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIdEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIdEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) // CHECK: call void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) @@ -59,12 +59,12 @@ export void foo() { // CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %Buf3) // Buf3 initialization part 2 - body of RWBuffer default C1 constructor that calls the default C2 constructor -// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) // CHECK: call void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}}) // Buf1 initialization part 3 - body of RWBuffer C2 constructor with explicit binding that initializes // handle with @llvm.dx.resource.handlefrombinding -// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) // CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t( // 
CHECK-DXIL-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}}) @@ -73,7 +73,7 @@ export void foo() { // Buf2 initialization part 3 - body of RWBuffer C2 constructor with implicit binding that initializes // handle with @llvm.dx.resource.handlefromimplicitbinding -// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) // CHECK: %[[HANDLE:.*]] = call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.TypedBuffer_f64_1_0_0t // CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}}) @@ -81,7 +81,7 @@ export void foo() { // CHECK-NEXT: store target("dx.TypedBuffer", double, 1, 0, 0) %[[HANDLE]], ptr %__handle, align 4 // Buf3 initialization part 3 - body of RWBuffer default C2 constructor that initializes handle to poison -// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.1", ptr %{{.*}}, i32 0, i32 0 // CHECK-NEXT: store target("dx.TypedBuffer", i32, 1, 0, 1) poison, ptr %__handle, align 4 diff --git a/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl b/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl index 8a3958ad8fd04..7804239edccae 100644 --- a/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl @@ -304,7 +304,7 @@ bool2 AccessBools() { return X.zw; } -// CHECK-LABEL: define void {{.*}}BoolSizeMismatch{{.*}} +// CHECK-LABEL: define hidden void 
{{.*}}BoolSizeMismatch{{.*}} // CHECK: [[B:%.*]] = alloca <4 x i32>, align 16 // CHECK-NEXT: [[Tmp:%.*]] = alloca <1 x i32>, align 4 // CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[B]], align 16 diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl index fc7b6be5c9000..28841732df99e 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl @@ -36,7 +36,7 @@ export void foo() { // Buf1 initialization part 2 - body of StructuredBuffer C1 constructor with explicit binding // that calls the C2 constructor -// CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl16StructuredBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) // CHECK: call void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) @@ -49,7 +49,7 @@ export void foo() { // CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]]) // Buf2 initialization part 2 - body of RWStructuredBuffer C1 constructor with implicit binding that calls the C2 constructor -// CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl18RWStructuredBufferIfEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) // CHECK: 
call void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}}) @@ -63,12 +63,12 @@ export void foo() { // Buf3 initialization part 2 - body of AppendStructuredBuffer default C1 constructor that calls // the default C2 constructor -// CHECK: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK: define linkonce_odr hidden void @_ZN4hlsl22AppendStructuredBufferIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) // CHECK: call void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}}) // Buf1 initialization part 3 - body of AppendStructuredBuffer C2 constructor with explicit binding // that initializes handle with @llvm.dx.resource.handlefrombinding -// CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name) // CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t( // CHECK-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}}) @@ -77,7 +77,7 @@ export void foo() { // Buf2 initialization part 3 - body of RWStructuredBuffer C2 constructor with implicit binding that initializes // handle with @llvm.dx.resource.handlefromimplicitbinding -// CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, +// CHECK: define linkonce_odr hidden void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr 
noundef nonnull align 4 dereferenceable(4) %this, // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name) // CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_f32_1_0t // CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}}) @@ -86,7 +86,7 @@ export void foo() { // Buf3 initialization part 3 - body of AppendStructuredBuffer default C2 constructor that // initializes handle to poison -// CHECK: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK: define linkonce_odr hidden void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::AppendStructuredBuffer", ptr %{{.*}}, i32 0, i32 0 // CHECK: store target("dx.RawBuffer", float, 1, 0) poison, ptr %__handle, align 4 diff --git a/clang/test/CodeGenHLSL/builtins/abs.hlsl b/clang/test/CodeGenHLSL/builtins/abs.hlsl index e8a6ee0449571..6abe2f816c844 100644 --- a/clang/test/CodeGenHLSL/builtins/abs.hlsl +++ b/clang/test/CodeGenHLSL/builtins/abs.hlsl @@ -8,16 +8,16 @@ using hlsl::abs; #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF-LABEL: define noundef i16 @_Z16test_abs_int16_t +// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z16test_abs_int16_t // NATIVE_HALF: call i16 @llvm.abs.i16( int16_t test_abs_int16_t(int16_t p0) { return abs(p0); } -// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z17test_abs_int16_t2 +// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z17test_abs_int16_t2 // NATIVE_HALF: call <2 x i16> @llvm.abs.v2i16( int16_t2 test_abs_int16_t2(int16_t2 p0) { return abs(p0); } -// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z17test_abs_int16_t3 +// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z17test_abs_int16_t3 // NATIVE_HALF: call <3 x i16> 
@llvm.abs.v3i16( int16_t3 test_abs_int16_t3(int16_t3 p0) { return abs(p0); } -// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z17test_abs_int16_t4 +// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z17test_abs_int16_t4 // NATIVE_HALF: call <4 x i16> @llvm.abs.v4i16( int16_t4 test_abs_int16_t4(int16_t4 p0) { return abs(p0); } @@ -50,76 +50,76 @@ uint16_t3 test_abs_uint64_t3(uint16_t3 p0) { return abs(p0); } uint16_t4 test_abs_uint64_t4(uint16_t4 p0) { return abs(p0); } #endif // __HLSL_ENABLE_16_BIT -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_abs_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_abs_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.fabs.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_abs_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_abs_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.fabs.f32(float %0) half test_abs_half(half p0) { return abs(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_abs_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_abs_half2 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.fabs.v2f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_abs_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_abs_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.fabs.v2f32( half2 test_abs_half2(half2 p0) { return abs(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_abs_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_abs_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.fabs.v3f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_abs_half3 +// NO_HALF-LABEL: define hidden 
noundef nofpclass(nan inf) <3 x float> @_Z14test_abs_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.fabs.v3f32( half3 test_abs_half3(half3 p0) { return abs(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_abs_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_abs_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.fabs.v4f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_abs_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_abs_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.fabs.v4f32( half4 test_abs_half4(half4 p0) { return abs(p0); } -// CHECK-LABEL: define noundef i32 @_Z12test_abs_int +// CHECK-LABEL: define hidden noundef i32 @_Z12test_abs_int // CHECK: call i32 @llvm.abs.i32( int test_abs_int(int p0) { return abs(p0); } -// CHECK-LABEL: define noundef <2 x i32> @_Z13test_abs_int2 +// CHECK-LABEL: define hidden noundef <2 x i32> @_Z13test_abs_int2 // CHECK: call <2 x i32> @llvm.abs.v2i32( int2 test_abs_int2(int2 p0) { return abs(p0); } -// CHECK-LABEL: define noundef <3 x i32> @_Z13test_abs_int3 +// CHECK-LABEL: define hidden noundef <3 x i32> @_Z13test_abs_int3 // CHECK: call <3 x i32> @llvm.abs.v3i32( int3 test_abs_int3(int3 p0) { return abs(p0); } -// CHECK-LABEL: define noundef <4 x i32> @_Z13test_abs_int4 +// CHECK-LABEL: define hidden noundef <4 x i32> @_Z13test_abs_int4 // CHECK: call <4 x i32> @llvm.abs.v4i32( int4 test_abs_int4(int4 p0) { return abs(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_abs_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_abs_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.fabs.f32( float test_abs_float(float p0) { return abs(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_abs_float2 +// CHECK-LABEL: define 
hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_abs_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.fabs.v2f32( float2 test_abs_float2(float2 p0) { return abs(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_abs_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_abs_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.fabs.v3f32( float3 test_abs_float3(float3 p0) { return abs(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_abs_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_abs_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.fabs.v4f32( float4 test_abs_float4(float4 p0) { return abs(p0); } -// CHECK-LABEL: define noundef i64 @_Z16test_abs_int64_t +// CHECK-LABEL: define hidden noundef i64 @_Z16test_abs_int64_t // CHECK: call i64 @llvm.abs.i64( int64_t test_abs_int64_t(int64_t p0) { return abs(p0); } -// CHECK-LABEL: define noundef <2 x i64> @_Z17test_abs_int64_t2 +// CHECK-LABEL: define hidden noundef <2 x i64> @_Z17test_abs_int64_t2 // CHECK: call <2 x i64> @llvm.abs.v2i64( int64_t2 test_abs_int64_t2(int64_t2 p0) { return abs(p0); } -// CHECK-LABEL: define noundef <3 x i64> @_Z17test_abs_int64_t3 +// CHECK-LABEL: define hidden noundef <3 x i64> @_Z17test_abs_int64_t3 // CHECK: call <3 x i64> @llvm.abs.v3i64( int64_t3 test_abs_int64_t3(int64_t3 p0) { return abs(p0); } -// CHECK-LABEL: define noundef <4 x i64> @_Z17test_abs_int64_t4 +// CHECK-LABEL: define hidden noundef <4 x i64> @_Z17test_abs_int64_t4 // CHECK: call <4 x i64> @llvm.abs.v4i64( int64_t4 test_abs_int64_t4(int64_t4 p0) { return abs(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) double @_Z15test_abs_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) double @_Z15test_abs_double // CHECK: call reassoc nnan ninf nsz arcp afn double @llvm.fabs.f64( double 
test_abs_double(double p0) { return abs(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x double> @_Z16test_abs_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x double> @_Z16test_abs_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.fabs.v2f64( double2 test_abs_double2(double2 p0) { return abs(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x double> @_Z16test_abs_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x double> @_Z16test_abs_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.fabs.v3f64( double3 test_abs_double3(double3 p0) { return abs(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> @_Z16test_abs_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> @_Z16test_abs_double4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.fabs.v4f64( double4 test_abs_double4(double4 p0) { return abs(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/all.hlsl b/clang/test/CodeGenHLSL/builtins/all.hlsl index 39f364c5953d6..391fad0ef33f5 100644 --- a/clang/test/CodeGenHLSL/builtins/all.hlsl +++ b/clang/test/CodeGenHLSL/builtins/all.hlsl @@ -2,20 +2,20 @@ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck 
%s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef" -DTARGET=dx #ifdef __HLSL_ENABLE_16_BIT // NATIVE_HALF: define [[FNATTRS]] i1 @ diff --git a/clang/test/CodeGenHLSL/builtins/and.hlsl b/clang/test/CodeGenHLSL/builtins/and.hlsl index b77889cd9ae70..d2ca7cf4163ed 100644 --- a/clang/test/CodeGenHLSL/builtins/and.hlsl +++ b/clang/test/CodeGenHLSL/builtins/and.hlsl @@ -3,7 +3,7 @@ // RUN: dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -O1 -o - | FileCheck %s -// CHECK-LABEL: define noundef i1 @_Z15test_and_scalarbb( +// CHECK-LABEL: define hidden noundef i1 @_Z15test_and_scalarbb( // CHECK-SAME: i1 noundef [[X:%.*]], i1 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_AND:%.*]] = and i1 [[X]], [[Y]] @@ -13,7 +13,7 @@ bool test_and_scalar(bool x, bool y) { return and(x, y); } -// CHECK-LABEL: define noundef <2 x i1> @_Z14test_and_bool2Dv2_bS_( +// CHECK-LABEL: define hidden noundef <2 x i1> @_Z14test_and_bool2Dv2_bS_( // CHECK-SAME: <2 x i1> noundef [[X:%.*]], <2 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_AND:%.*]] = and <2 x i1> [[X]], [[Y]] @@ -23,7 +23,7 @@ bool2 test_and_bool2(bool2 x, bool2 y) { return and(x, y); } -// CHECK-LABEL: define noundef <3 x i1> @_Z14test_and_bool3Dv3_bS_( +// CHECK-LABEL: define hidden noundef <3 x i1> @_Z14test_and_bool3Dv3_bS_( // CHECK-SAME: <3 x i1> noundef [[X:%.*]], <3 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_AND:%.*]] = and <3 x i1> [[X]], [[Y]] @@ -33,7 +33,7 @@ bool3 test_and_bool3(bool3 
x, bool3 y) { return and(x, y); } -// CHECK-LABEL: define noundef <4 x i1> @_Z14test_and_bool4Dv4_bS_( +// CHECK-LABEL: define hidden noundef <4 x i1> @_Z14test_and_bool4Dv4_bS_( // CHECK-SAME: <4 x i1> noundef [[X:%.*]], <4 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_AND:%.*]] = and <4 x i1> [[X]], [[Y]] @@ -43,7 +43,7 @@ bool4 test_and_bool4(bool4 x, bool4 y) { return and(x, y); } -// CHECK-LABEL: define noundef <4 x i1> @_Z13test_and_int4Dv4_iS_( +// CHECK-LABEL: define hidden noundef <4 x i1> @_Z13test_and_int4Dv4_iS_( // CHECK-SAME: <4 x i32> noundef [[X:%.*]], <4 x i32> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[X]], zeroinitializer @@ -55,7 +55,7 @@ bool4 test_and_int4(int4 x, int4 y) { return and(x, y); } -// CHECK-LABEL: define noundef <4 x i1> @_Z15test_and_float4Dv4_fS_( +// CHECK-LABEL: define hidden noundef <4 x i1> @_Z15test_and_float4Dv4_fS_( // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TOBOOL:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une <4 x float> [[X]], zeroinitializer diff --git a/clang/test/CodeGenHLSL/builtins/any.hlsl b/clang/test/CodeGenHLSL/builtins/any.hlsl index 3d9d8e9e689ed..e4837876e2693 100644 --- a/clang/test/CodeGenHLSL/builtins/any.hlsl +++ b/clang/test/CodeGenHLSL/builtins/any.hlsl @@ -2,20 +2,20 @@ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s 
--check-prefixes=CHECK \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef" -DTARGET=dx #ifdef __HLSL_ENABLE_16_BIT // NATIVE_HALF: define [[FNATTRS]] i1 @ diff --git a/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl index b313c99e89a53..bdefe46b802e7 100644 --- a/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl @@ -4,67 +4,67 @@ using hlsl::ceil; -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_double // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32( float test_ceil_double(double p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32( float2 test_ceil_double2(double2 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32( float3 
test_ceil_double3(double3 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_double4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32( float4 test_ceil_double4(double4 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_int // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32( float test_ceil_int(int p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32( float2 test_ceil_int2(int2 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32( float3 test_ceil_int3(int3 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32( float4 test_ceil_int4(int4 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_uint // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32( float test_ceil_uint(uint p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> 
{{.*}}test_ceil_uint2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32( float2 test_ceil_uint2(uint2 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32( float3 test_ceil_uint3(uint3 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32( float4 test_ceil_uint4(uint4 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_int64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32( float test_ceil_int64_t(int64_t p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32( float2 test_ceil_int64_t2(int64_t2 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32( float3 test_ceil_int64_t3(int64_t3 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32( float4 test_ceil_int64_t4(int64_t4 p0) { 
return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_uint64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32( float test_ceil_uint64_t(uint64_t p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32( float2 test_ceil_uint64_t2(uint64_t2 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32( float3 test_ceil_uint64_t3(uint64_t3 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32( float4 test_ceil_uint64_t4(uint64_t4 p0) { return ceil(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/ceil.hlsl b/clang/test/CodeGenHLSL/builtins/ceil.hlsl index fe0b8f8983838..1a9c630b60e57 100644 --- a/clang/test/CodeGenHLSL/builtins/ceil.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ceil.hlsl @@ -7,36 +7,36 @@ using hlsl::ceil; -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_ceil_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_ceil_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.ceil.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_ceil_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_ceil_half // NO_HALF: call reassoc nnan 
ninf nsz arcp afn float @llvm.ceil.f32(float %0) half test_ceil_half(half p0) { return ceil(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_ceil_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_ceil_half2 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.ceil.v2f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_ceil_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_ceil_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32( half2 test_ceil_half2(half2 p0) { return ceil(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_ceil_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_ceil_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.ceil.v3f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_ceil_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_ceil_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32( half3 test_ceil_half3(half3 p0) { return ceil(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_ceil_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_ceil_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.ceil.v4f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_ceil_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_ceil_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32( half4 test_ceil_half4(half4 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_ceil_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_ceil_float // CHECK: 
call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32( float test_ceil_float(float p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_ceil_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_ceil_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32( float2 test_ceil_float2(float2 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_ceil_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_ceil_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32( float3 test_ceil_float3(float3 p0) { return ceil(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_ceil_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_ceil_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32( float4 test_ceil_float4(float4 p0) { return ceil(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl index c0e1e914831aa..eaedfb419c195 100644 --- a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl @@ -1,18 +1,18 @@ // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)" +// RUN: -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)" +// RUN: -DTARGET=dx -DFNATTRS="hidden noundef" 
-DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ // RUN: -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)" +// RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ // RUN: -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)" +// RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)" #ifdef __HLSL_ENABLE_16_BIT // NATIVE_HALF: define [[FNATTRS]] <4 x i16> {{.*}}test_clamp_short4_mismatch diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl index d01c2a45c43c8..58db4423799be 100644 --- a/clang/test/CodeGenHLSL/builtins/clamp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl @@ -1,19 +1,19 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)" +// RUN: -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)" +// RUN: -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ // RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s 
--check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)" +// RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)" // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)" +// RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)" #ifdef __HLSL_ENABLE_16_BIT // NATIVE_HALF: define [[FNATTRS]] i16 @_Z16test_clamp_short diff --git a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl index c864f93af472b..aaeb2f026449b 100644 --- a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s -// CHECK: define void @{{.*}}builtin_clip_float{{.*}}(float {{.*}} [[P0:%.*]]) +// CHECK: define hidden void @{{.*}}builtin_clip_float{{.*}}(float {{.*}} [[P0:%.*]]) // CHECK: [[LOAD:%.*]] = load float, ptr [[P0]].addr, align 4 // CHECK-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float [[LOAD]], 0.000000e+00 // CHECK-NO: call i1 @llvm.dx.any diff --git a/clang/test/CodeGenHLSL/builtins/clip.hlsl b/clang/test/CodeGenHLSL/builtins/clip.hlsl index 5a1753766a8a1..e067828c38bf6 100644 --- a/clang/test/CodeGenHLSL/builtins/clip.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clip.hlsl @@ -3,13 +3,13 @@ void test_scalar(float Buf) { - // CHECK: define void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]]) + // CHECK: define hidden void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]]) // CHECK: [[LOAD:%.*]] = load float, ptr [[VALP]].addr, align 4 // CHECK-NEXT: 
[[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float [[LOAD]], 0.000000e+00 // CHECK-NO: call i1 @llvm.dx.any // CHECK-NEXT: call void @llvm.dx.discard(i1 [[FCMP]]) // - // SPIRV: define spir_func void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]]) + // SPIRV: define hidden spir_func void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]]) // SPIRV: [[LOAD:%.*]] = load float, ptr [[VALP]].addr, align 4 // SPIRV-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float [[LOAD]], 0.000000e+00 // SPIRV-NO: call i1 @llvm.spv.any @@ -21,13 +21,13 @@ void test_scalar(float Buf) { } void test_vector4(float4 Buf) { - // CHECK: define void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]]) + // CHECK: define hidden void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]]) // CHECK: [[LOAD:%.*]] = load <4 x float>, ptr [[VALP]].addr, align 16 // CHECK-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x float> [[LOAD]], zeroinitializer // CHECK-NEXT: [[ANYC:%.*]] = call i1 @llvm.dx.any.v4i1(<4 x i1> [[FCMP]]) // CHECK-NEXT: call void @llvm.dx.discard(i1 [[ANYC]]) // - // SPIRV: define spir_func void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]]) + // SPIRV: define hidden spir_func void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]]) // SPIRV: [[LOAD:%.*]] = load <4 x float>, ptr [[VALP]].addr, align 16 // SPIRV-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x float> [[LOAD]], zeroinitializer // SPIRV-NEXT: [[ANYC:%.*]] = call i1 @llvm.spv.any.v4i1(<4 x i1> [[FCMP]]) diff --git a/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl index b7b11b1c3bd6d..70926cc8ba743 100644 --- a/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl @@ -2,67 +2,67 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef nofpclass(nan 
inf) float {{.*}}test_cos_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_double // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32( float test_cos_double(double p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32 float2 test_cos_double2(double2 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32 float3 test_cos_double3(double3 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_double4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32 float4 test_cos_double4(double4 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_int // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32( float test_cos_int(int p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32 float2 test_cos_int2(int2 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32 float3 
test_cos_int3(int3 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32 float4 test_cos_int4(int4 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_uint // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32( float test_cos_uint(uint p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32 float2 test_cos_uint2(uint2 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32 float3 test_cos_uint3(uint3 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32 float4 test_cos_uint4(uint4 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_int64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32( float test_cos_int64_t(int64_t p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int64_t2 // CHECK: 
call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32 float2 test_cos_int64_t2(int64_t2 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32 float3 test_cos_int64_t3(int64_t3 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32 float4 test_cos_int64_t4(int64_t4 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_uint64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32( float test_cos_uint64_t(uint64_t p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32 float2 test_cos_uint64_t2(uint64_t2 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32 float3 test_cos_uint64_t3(uint64_t3 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32 float4 test_cos_uint64_t4(uint64_t4 p0) { return cos(p0); } diff 
--git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl index 5f993d50498bf..79f9e1e6fbec2 100644 --- a/clang/test/CodeGenHLSL/builtins/cos.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl @@ -5,36 +5,36 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_cos_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_cos_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.cos.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_cos_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_cos_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32( half test_cos_half(half p0) { return cos(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_cos_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_cos_half2 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.cos.v2f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_cos_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_cos_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32( half2 test_cos_half2(half2 p0) { return cos(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_cos_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_cos_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.cos.v3f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_cos_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_cos_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32( half3 test_cos_half3(half3 p0) { return cos(p0); 
} -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_cos_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_cos_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.cos.v4f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_cos_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_cos_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32( half4 test_cos_half4(half4 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_cos_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_cos_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32( float test_cos_float(float p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_cos_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_cos_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32 float2 test_cos_float2(float2 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_cos_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_cos_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32 float3 test_cos_float3(float3 p0) { return cos(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_cos_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_cos_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32 float4 test_cos_float4(float4 p0) { return cos(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/cross.hlsl b/clang/test/CodeGenHLSL/builtins/cross.hlsl index b2a1d6316787d..89ac383e2517f 100644 --- a/clang/test/CodeGenHLSL/builtins/cross.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cross.hlsl 
@@ -2,20 +2,20 @@ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // NATIVE_HALF: define [[FNATTRS]] <3 x half> @ // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.[[TARGET]].cross.v3f16(<3 x half> diff --git a/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl index bafd2368c9961..a1abf435ea10c 100644 --- a/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl @@ -1,11 +1,11 @@ // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s 
--check-prefixes=CHECK \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // CHECK: define [[FNATTRS]] float @ // CHECK: %hlsl.degrees = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].degrees.f32( diff --git a/clang/test/CodeGenHLSL/builtins/degrees.hlsl b/clang/test/CodeGenHLSL/builtins/degrees.hlsl index 64531dd2785eb..f0fb12855e5f6 100644 --- a/clang/test/CodeGenHLSL/builtins/degrees.hlsl +++ b/clang/test/CodeGenHLSL/builtins/degrees.hlsl @@ -2,20 +2,20 @@ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm 
-disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // NATIVE_HALF: define [[FNATTRS]] half @ // NATIVE_HALF: %hlsl.degrees = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].degrees.f16( diff --git a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl index ac38cf1853799..0c24fbb9f1859 100644 --- a/clang/test/CodeGenHLSL/builtins/distance.hlsl +++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl @@ -6,14 +6,14 @@ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh( // CHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]] // CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[SUB_I]]) // CHECK-NEXT: ret half [[ELT_ABS_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh( // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]] @@ -22,7 +22,7 @@ // half test_distance_half(half X, half Y) { return distance(X, Y); } -// CHECK-LABEL: define noundef nofpclass(nan inf) half 
@_Z19test_distance_half2Dv2_DhS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_( // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]] @@ -30,7 +30,7 @@ half test_distance_half(half X, half Y) { return distance(X, Y); } // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]] @@ -39,7 +39,7 @@ half test_distance_half(half X, half Y) { return distance(X, Y); } // half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } -// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_( // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]] @@ -47,7 +47,7 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // -// SPVCHECK-LABEL: 
define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]] @@ -56,7 +56,7 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } // half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } -// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_( // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]] @@ -64,7 +64,7 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]] @@ -73,14 +73,14 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } // half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); } -// CHECK-LABEL: define 
noundef nofpclass(nan inf) float @_Z19test_distance_floatff( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z19test_distance_floatff( // CHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]] // CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[SUB_I]]) // CHECK-NEXT: ret float [[ELT_ABS_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z19test_distance_floatff( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z19test_distance_floatff( // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]] @@ -89,7 +89,7 @@ half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); } // float test_distance_float(float X, float Y) { return distance(X, Y); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_( // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]] @@ -97,7 +97,7 @@ float test_distance_float(float X, float Y) { return distance(X, Y); } // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float 
@_Z20test_distance_float2Dv2_fS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_( // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]] @@ -106,7 +106,7 @@ float test_distance_float(float X, float Y) { return distance(X, Y); } // float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_( // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]] @@ -114,7 +114,7 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]] @@ -123,7 +123,7 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } // float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } -// CHECK-LABEL: define noundef 
nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_( // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]] @@ -131,7 +131,7 @@ float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]] diff --git a/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl index 858a1210169d2..df34beeba7a8c 100644 --- a/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl @@ -2,87 +2,87 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_double // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( // CHECK: ret float %elt.exp float test_exp_double(double p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_double2 +// CHECK-LABEL: define hidden 
noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_double2 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 // CHECK: ret <2 x float> %elt.exp float2 test_exp_double2(double2 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_double3 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 // CHECK: ret <3 x float> %elt.exp float3 test_exp_double3(double3 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_double4 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 // CHECK: ret <4 x float> %elt.exp float4 test_exp_double4(double4 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_int // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( // CHECK: ret float %elt.exp float test_exp_int(int p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int2 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 // CHECK: ret <2 x float> %elt.exp float2 test_exp_int2(int2 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int3 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 // CHECK: ret <3 x float> %elt.exp float3 test_exp_int3(int3 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) 
<4 x float> {{.*}}test_exp_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int4 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 // CHECK: ret <4 x float> %elt.exp float4 test_exp_int4(int4 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_uint // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( // CHECK: ret float %elt.exp float test_exp_uint(uint p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint2 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 // CHECK: ret <2 x float> %elt.exp float2 test_exp_uint2(uint2 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint3 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 // CHECK: ret <3 x float> %elt.exp float3 test_exp_uint3(uint3 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint4 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 // CHECK: ret <4 x float> %elt.exp float4 test_exp_uint4(uint4 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_int64_t // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( // CHECK: ret float %elt.exp float test_exp_int64_t(int64_t p0) { return exp(p0); } -// CHECK-LABEL: 
define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int64_t2 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 // CHECK: ret <2 x float> %elt.exp float2 test_exp_int64_t2(int64_t2 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int64_t3 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 // CHECK: ret <3 x float> %elt.exp float3 test_exp_int64_t3(int64_t3 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int64_t4 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 // CHECK: ret <4 x float> %elt.exp float4 test_exp_int64_t4(int64_t4 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_uint64_t // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( // CHECK: ret float %elt.exp float test_exp_uint64_t(uint64_t p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint64_t2 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32 // CHECK: ret <2 x float> %elt.exp float2 test_exp_uint64_t2(uint64_t2 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint64_t3 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 
x float> @llvm.exp.v3f32 // CHECK: ret <3 x float> %elt.exp float3 test_exp_uint64_t3(uint64_t3 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint64_t4 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 // CHECK: ret <4 x float> %elt.exp float4 test_exp_uint64_t4(uint64_t4 p0) { return exp(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/exp.hlsl b/clang/test/CodeGenHLSL/builtins/exp.hlsl index 6ed40ed8f433c..5a8f60528a84c 100644 --- a/clang/test/CodeGenHLSL/builtins/exp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp.hlsl @@ -5,48 +5,48 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_exp_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_exp_half // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn half @llvm.exp.f16( // NATIVE_HALF: ret half %elt.exp -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_exp_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_exp_half // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( // NO_HALF: ret float %elt.exp half test_exp_half(half p0) { return exp(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_exp_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_exp_half2 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp.v2f16 // NATIVE_HALF: ret <2 x half> %elt.exp -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_exp_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_exp_half2 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x 
float> @llvm.exp.v2f32( // NO_HALF: ret <2 x float> %elt.exp half2 test_exp_half2(half2 p0) { return exp(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_exp_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_exp_half3 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp.v3f16 // NATIVE_HALF: ret <3 x half> %elt.exp -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_exp_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_exp_half3 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32( // NO_HALF: ret <3 x float> %elt.exp half3 test_exp_half3(half3 p0) { return exp(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_exp_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_exp_half4 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp.v4f16 // NATIVE_HALF: ret <4 x half> %elt.exp -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_exp_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_exp_half4 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32( // NO_HALF: ret <4 x float> %elt.exp half4 test_exp_half4(half4 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_exp_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_exp_float // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32( // CHECK: ret float %elt.exp float test_exp_float(float p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_exp_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_exp_float2 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x 
float> @llvm.exp.v2f32 // CHECK: ret <2 x float> %elt.exp float2 test_exp_float2(float2 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_exp_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_exp_float3 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32 // CHECK: ret <3 x float> %elt.exp float3 test_exp_float3(float3 p0) { return exp(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_exp_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_exp_float4 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32 // CHECK: ret <4 x float> %elt.exp float4 test_exp_float4(float4 p0) { return exp(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl index ef522afc244a8..20482777a18de 100644 --- a/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl @@ -2,87 +2,87 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_double // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( // CHECK: ret float %elt.exp2 float test_exp2_double(double p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_double2 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 // CHECK: ret <2 x float> %elt.exp2 float2 test_exp2_double2(double2 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_double3 +// CHECK-LABEL: 
define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_double3 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 // CHECK: ret <3 x float> %elt.exp2 float3 test_exp2_double3(double3 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_double4 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 // CHECK: ret <4 x float> %elt.exp2 float4 test_exp2_double4(double4 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_int // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( // CHECK: ret float %elt.exp2 float test_exp2_int(int p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int2 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 // CHECK: ret <2 x float> %elt.exp2 float2 test_exp2_int2(int2 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int3 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 // CHECK: ret <3 x float> %elt.exp2 float3 test_exp2_int3(int3 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int4 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 // CHECK: ret <4 x float> %elt.exp2 float4 test_exp2_int4(int4 p0) { return exp2(p0); } -// 
CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_uint // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( // CHECK: ret float %elt.exp2 float test_exp2_uint(uint p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint2 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 // CHECK: ret <2 x float> %elt.exp2 float2 test_exp2_uint2(uint2 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint3 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 // CHECK: ret <3 x float> %elt.exp2 float3 test_exp2_uint3(uint3 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint4 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 // CHECK: ret <4 x float> %elt.exp2 float4 test_exp2_uint4(uint4 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_int64_t // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( // CHECK: ret float %elt.exp2 float test_exp2_int64_t(int64_t p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int64_t2 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 // CHECK: 
ret <2 x float> %elt.exp2 float2 test_exp2_int64_t2(int64_t2 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int64_t3 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 // CHECK: ret <3 x float> %elt.exp2 float3 test_exp2_int64_t3(int64_t3 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int64_t4 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 // CHECK: ret <4 x float> %elt.exp2 float4 test_exp2_int64_t4(int64_t4 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_uint64_t // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( // CHECK: ret float %elt.exp2 float test_exp2_uint64_t(uint64_t p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint64_t2 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 // CHECK: ret <2 x float> %elt.exp2 float2 test_exp2_uint64_t2(uint64_t2 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint64_t3 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 // CHECK: ret <3 x float> %elt.exp2 float3 test_exp2_uint64_t3(uint64_t3 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint64_t4 +// 
CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint64_t4 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 // CHECK: ret <4 x float> %elt.exp2 float4 test_exp2_uint64_t4(uint64_t4 p0) { return exp2(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/exp2.hlsl b/clang/test/CodeGenHLSL/builtins/exp2.hlsl index b067427e46368..a9bbcb0d9bff9 100644 --- a/clang/test/CodeGenHLSL/builtins/exp2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp2.hlsl @@ -5,48 +5,48 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_exp2_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_exp2_half // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16( // NATIVE_HALF: ret half %elt.exp2 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_exp2_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_exp2_half // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( // NO_HALF: ret float %elt.exp2 half test_exp2_half(half p0) { return exp2(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_exp2_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_exp2_half2 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp2.v2f16 // NATIVE_HALF: ret <2 x half> %elt.exp2 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_exp2_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_exp2_half2 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32( // NO_HALF: ret <2 x float> %elt.exp2 half2 test_exp2_half2(half2 p0) { return exp2(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> 
@_Z15test_exp2_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_exp2_half3 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp2.v3f16 // NATIVE_HALF: ret <3 x half> %elt.exp2 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_exp2_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_exp2_half3 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32( // NO_HALF: ret <3 x float> %elt.exp2 half3 test_exp2_half3(half3 p0) { return exp2(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_exp2_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_exp2_half4 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp2.v4f16 // NATIVE_HALF: ret <4 x half> %elt.exp2 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_exp2_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_exp2_half4 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32( // NO_HALF: ret <4 x float> %elt.exp2 half4 test_exp2_half4(half4 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_exp2_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_exp2_float // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32( // CHECK: ret float %elt.exp2 float test_exp2_float(float p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_exp2_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_exp2_float2 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32 // CHECK: ret <2 x float> %elt.exp2 float2 test_exp2_float2(float2 p0) { return exp2(p0); } -// CHECK-LABEL: define 
noundef nofpclass(nan inf) <3 x float> @_Z16test_exp2_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_exp2_float3 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32 // CHECK: ret <3 x float> %elt.exp2 float3 test_exp2_float3(float3 p0) { return exp2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_exp2_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_exp2_float4 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32 // CHECK: ret <4 x float> %elt.exp2 float4 test_exp2_float4(float4 p0) { return exp2(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl index debf6b6d3e3f5..a71b1878f8b55 100644 --- a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl +++ b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl @@ -151,3 +151,11 @@ uint3 test_firstbithigh_long3(int64_t3 p0) { uint4 test_firstbithigh_long4(int64_t4 p0) { return firstbithigh(p0); } + +// CHECK-LABEL: test_firstbithigh_upcast +// CHECK: [[FBH:%.*]] = call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32(<4 x i32> %{{.*}}) +// CHECK: [[CONV:%.*]] = zext <4 x i32> [[FBH]] to <4 x i64> +// CHECK: ret <4 x i64> [[CONV]] +uint64_t4 test_firstbithigh_upcast(uint4 p0) { + return firstbithigh(p0); +} diff --git a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl index 5d490fabc5bc8..007db0c9c2ad5 100644 --- a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl +++ b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl @@ -151,3 +151,11 @@ uint3 test_firstbitlow_long3(int64_t3 p0) { uint4 test_firstbitlow_long4(int64_t4 p0) { return firstbitlow(p0); } + +// CHECK-LABEL: test_firstbitlow_upcast +// CHECK: [[FBL:%.*]] = call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i32(<4 x i32> %{{.*}}) +// CHECK: [[CONV:%.*]] = zext <4 x i32> [[FBL]] to 
<4 x i64> +// CHECK: ret <4 x i64> [[CONV]] +uint64_t4 test_firstbitlow_upcast(uint4 p0) { + return firstbitlow(p0); +} diff --git a/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl index 26d83443ea489..1e413e53f333e 100644 --- a/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl @@ -4,67 +4,67 @@ using hlsl::floor; -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_double // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32( float test_floor_double(double p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32( float2 test_floor_double2(double2 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32( float3 test_floor_double3(double3 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_double4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32( float4 test_floor_double4(double4 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_int // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32( float test_floor_int(int p0) { return floor(p0); } -// CHECK-LABEL: define 
noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32( float2 test_floor_int2(int2 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32( float3 test_floor_int3(int3 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32( float4 test_floor_int4(int4 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_uint // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32( float test_floor_uint(uint p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32( float2 test_floor_uint2(uint2 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32( float3 test_floor_uint3(uint3 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint4 // CHECK: 
call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32( float4 test_floor_uint4(uint4 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_int64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32( float test_floor_int64_t(int64_t p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32( float2 test_floor_int64_t2(int64_t2 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32( float3 test_floor_int64_t3(int64_t3 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32( float4 test_floor_int64_t4(int64_t4 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_uint64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32( float test_floor_uint64_t(uint64_t p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32( float2 test_floor_uint64_t2(uint64_t2 
p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32( float3 test_floor_uint64_t3(uint64_t3 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32( float4 test_floor_uint64_t4(uint64_t4 p0) { return floor(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl index f610baeeefd48..b3ff58317981a 100644 --- a/clang/test/CodeGenHLSL/builtins/floor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl @@ -7,36 +7,36 @@ using hlsl::floor; -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_floor_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_floor_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.floor.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_floor_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_floor_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(float %0) half test_floor_half(half p0) { return floor(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_floor_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_floor_half2 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.floor.v2f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_floor_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_floor_half2 // NO_HALF: call reassoc nnan ninf nsz arcp 
afn <2 x float> @llvm.floor.v2f32( half2 test_floor_half2(half2 p0) { return floor(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_floor_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_floor_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.floor.v3f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_floor_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_floor_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32( half3 test_floor_half3(half3 p0) { return floor(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_floor_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_floor_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.floor.v4f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_floor_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_floor_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32( half4 test_floor_half4(half4 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_floor_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_floor_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32( float test_floor_float(float p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_floor_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_floor_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32( float2 test_floor_float2(float2 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_floor_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan 
inf) <3 x float> @_Z17test_floor_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32( float3 test_floor_float3(float3 p0) { return floor(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_floor_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_floor_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32( float4 test_floor_float4(float4 p0) { return floor(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/fmod.hlsl b/clang/test/CodeGenHLSL/builtins/fmod.hlsl index 7ecc5854b3988..cc91c0b67f6cc 100644 --- a/clang/test/CodeGenHLSL/builtins/fmod.hlsl +++ b/clang/test/CodeGenHLSL/builtins/fmod.hlsl @@ -4,7 +4,7 @@ // // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -o - | FileCheck %s -DFNATTRS="noundef nofpclass(nan inf)" \ +// RUN: -emit-llvm -o - | FileCheck %s -DFNATTRS="hidden noundef nofpclass(nan inf)" \ // RUN: -DTYPE=half -DINT_TYPE=f16 --check-prefixes=DXCHECK // @@ -12,7 +12,7 @@ // // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm \ -// RUN: -o - | FileCheck %s -DFNATTRS="noundef nofpclass(nan inf)" \ +// RUN: -o - | FileCheck %s -DFNATTRS="hidden noundef nofpclass(nan inf)" \ // RUN: -DTYPE=float -DINT_TYPE=f32 --check-prefixes=DXCHECK @@ -23,7 +23,7 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -o - | FileCheck %s \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTYPE=half +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTYPE=half // // ---------- No Native Half support test ----------- @@ -31,7 +31,7 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm \ // RUN: -o - | FileCheck %s \ 
-// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTYPE=float +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTYPE=float diff --git a/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl index b0e844bd8a8d8..7a3f7b0069480 100644 --- a/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl @@ -1,11 +1,11 @@ // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // CHECK: define [[FNATTRS]] float @ // CHECK: %hlsl.frac = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].frac.f32( diff --git a/clang/test/CodeGenHLSL/builtins/frac.hlsl b/clang/test/CodeGenHLSL/builtins/frac.hlsl index 7b105ce84359f..d8397407cd013 100644 --- a/clang/test/CodeGenHLSL/builtins/frac.hlsl +++ b/clang/test/CodeGenHLSL/builtins/frac.hlsl @@ -2,20 +2,20 @@ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s 
--check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // NATIVE_HALF: define [[FNATTRS]] half @ // NATIVE_HALF: %hlsl.frac = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].frac.f16( diff --git a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl index 6d2ae6535ecb3..24114b11c7602 100644 --- a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl +++ b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl @@ -6,9 +6,9 @@ using handle_float_t = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::c // CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", %struct.MyStruct, 0, 0) // CHECK: %struct.MyStruct = type <{ <4 x float>, <2 x i32> }> -// CHECK: define void @_Z2faU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a) +// CHECK: define hidden void @_Z2faU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a) // CHECK: call void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %0) -// CHECK: declare void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0)) 
+// CHECK: declare hidden void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0)) void foo1(handle_float_t res); @@ -16,14 +16,14 @@ void fa(handle_float_t a) { foo1(a); } -// CHECK: define void @_Z2fbU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a) +// CHECK: define hidden void @_Z2fbU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a) void fb(handle_float_t a) { handle_float_t b = a; } -// CHECK: define void @_Z2fcN4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %a) +// CHECK: define hidden void @_Z2fcN4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %a) // CHECK: call void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %agg.tmp) -// CHECK: declare void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4) +// CHECK: declare hidden void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4) void foo2(RWBuffer buf); void fc(RWBuffer a) { @@ -39,9 +39,9 @@ struct MyStruct { int2 i; }; -// CHECK: define void @_Z2feN4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4 %a) +// CHECK: define hidden void @_Z2feN4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4 %a) // CHECK: call void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4 %agg.tmp) -// CHECK: declare void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4) +// CHECK: declare hidden void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4) void foo3(StructuredBuffer buf); void fe(StructuredBuffer a) { diff --git a/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl index 
ace209003ce43..f39cba9ace6e3 100644 --- a/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl @@ -2,19 +2,19 @@ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s -// CHECK: define noundef i1 @ +// CHECK: define hidden noundef i1 @ // CHECK: %dx.isinf = call i1 @llvm.dx.isinf.f32( // CHECK: ret i1 %dx.isinf bool test_isinf_double(double p0) { return isinf(p0); } -// CHECK: define noundef <2 x i1> @ +// CHECK: define hidden noundef <2 x i1> @ // CHECK: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32 // CHECK: ret <2 x i1> %dx.isinf bool2 test_isinf_double2(double2 p0) { return isinf(p0); } -// CHECK: define noundef <3 x i1> @ +// CHECK: define hidden noundef <3 x i1> @ // CHECK: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32 // CHECK: ret <3 x i1> %dx.isinf bool3 test_isinf_double3(double3 p0) { return isinf(p0); } -// CHECK: define noundef <4 x i1> @ +// CHECK: define hidden noundef <4 x i1> @ // CHECK: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32 // CHECK: ret <4 x i1> %dx.isinf bool4 test_isinf_double4(double4 p0) { return isinf(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/isinf.hlsl b/clang/test/CodeGenHLSL/builtins/isinf.hlsl index df44fc4a91dfd..4d53daaafb692 100644 --- a/clang/test/CodeGenHLSL/builtins/isinf.hlsl +++ b/clang/test/CodeGenHLSL/builtins/isinf.hlsl @@ -6,40 +6,40 @@ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -// CHECK: define noundef i1 @ +// CHECK: define hidden noundef i1 @ // NATIVE_HALF: %dx.isinf = call i1 @llvm.dx.isinf.f16( // NO_HALF: %dx.isinf = call i1 @llvm.dx.isinf.f32( // CHECK: ret i1 %dx.isinf bool test_isinf_half(half p0) { return isinf(p0); } -// CHECK: define noundef <2 x i1> @ +// CHECK: define hidden noundef <2 x i1> @ // NATIVE_HALF: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f16 // NO_HALF: %dx.isinf = call <2 x i1> 
@llvm.dx.isinf.v2f32( // CHECK: ret <2 x i1> %dx.isinf bool2 test_isinf_half2(half2 p0) { return isinf(p0); } -// NATIVE_HALF: define noundef <3 x i1> @ +// NATIVE_HALF: define hidden noundef <3 x i1> @ // NATIVE_HALF: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f16 // NO_HALF: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32( // CHECK: ret <3 x i1> %dx.isinf bool3 test_isinf_half3(half3 p0) { return isinf(p0); } -// NATIVE_HALF: define noundef <4 x i1> @ +// NATIVE_HALF: define hidden noundef <4 x i1> @ // NATIVE_HALF: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f16 // NO_HALF: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32( // CHECK: ret <4 x i1> %dx.isinf bool4 test_isinf_half4(half4 p0) { return isinf(p0); } -// CHECK: define noundef i1 @ +// CHECK: define hidden noundef i1 @ // CHECK: %dx.isinf = call i1 @llvm.dx.isinf.f32( // CHECK: ret i1 %dx.isinf bool test_isinf_float(float p0) { return isinf(p0); } -// CHECK: define noundef <2 x i1> @ +// CHECK: define hidden noundef <2 x i1> @ // CHECK: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32 // CHECK: ret <2 x i1> %dx.isinf bool2 test_isinf_float2(float2 p0) { return isinf(p0); } -// CHECK: define noundef <3 x i1> @ +// CHECK: define hidden noundef <3 x i1> @ // CHECK: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32 // CHECK: ret <3 x i1> %dx.isinf bool3 test_isinf_float3(float3 p0) { return isinf(p0); } -// CHECK: define noundef <4 x i1> @ +// CHECK: define hidden noundef <4 x i1> @ // CHECK: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32 // CHECK: ret <4 x i1> %dx.isinf bool4 test_isinf_float4(float4 p0) { return isinf(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl index ea0d1348c6e4e..f8fa06c39f2a1 100644 --- a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl @@ -1,48 +1,48 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm 
-disable-llvm-passes -o - | FileCheck %s -// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_ +// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_ // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(half %{{.*}}) // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn half %elt.exp2, %{{.*}} // CHECK: ret half %mul half test_ldexp_half(half X, half Exp) { return ldexp(X, Exp); } -// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <2 x half> @_ZN4hlsl8__detail10ldexp_implIDv2_DhEET_S3_S3_ +// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <2 x half> @_ZN4hlsl8__detail10ldexp_implIDv2_DhEET_S3_S3_ // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp2.v2f16(<2 x half> %{{.*}}) // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <2 x half> %elt.exp2, %{{.*}} // CHECK: ret <2 x half> %mul half2 test_ldexp_half2(half2 X, half2 Exp) { return ldexp(X, Exp); } -// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <3 x half> @_ZN4hlsl8__detail10ldexp_implIDv3_DhEET_S3_S3_ +// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <3 x half> @_ZN4hlsl8__detail10ldexp_implIDv3_DhEET_S3_S3_ // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp2.v3f16(<3 x half> %{{.*}}) // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <3 x half> %elt.exp2, %{{.*}} // CHECK: ret <3 x half> %mul half3 test_ldexp_half3(half3 X, half3 Exp) { return ldexp(X, Exp); } -// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <4 x half> @_ZN4hlsl8__detail10ldexp_implIDv4_DhEET_S3_S3_ +// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <4 x half> @_ZN4hlsl8__detail10ldexp_implIDv4_DhEET_S3_S3_ // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp2.v4f16(<4 x half> %{{.*}}) // CHECK: %mul = fmul 
reassoc nnan ninf nsz arcp afn <4 x half> %elt.exp2, %{{.*}} // CHECK: ret <4 x half> %mul half4 test_ldexp_half4(half4 X, half4 Exp) { return ldexp(X, Exp); } -// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) float @_ZN4hlsl8__detail10ldexp_implIfEET_S2_S2_ +// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) float @_ZN4hlsl8__detail10ldexp_implIfEET_S2_S2_ // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(float %{{.*}}) // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn float %elt.exp2, %{{.*}} // CHECK: ret float %mul float test_ldexp_float(float X, float Exp) { return ldexp(X, Exp); } -// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <2 x float> @_ZN4hlsl8__detail10ldexp_implIDv2_fEET_S3_S3_ +// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <2 x float> @_ZN4hlsl8__detail10ldexp_implIDv2_fEET_S3_S3_ // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32(<2 x float> %{{.*}}) // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <2 x float> %elt.exp2, %{{.*}} // CHECK: ret <2 x float> %mul float2 test_ldexp_float2(float2 X, float2 Exp) { return ldexp(X, Exp); } -// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <3 x float> @_ZN4hlsl8__detail10ldexp_implIDv3_fEET_S3_S3_ +// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <3 x float> @_ZN4hlsl8__detail10ldexp_implIDv3_fEET_S3_S3_ // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32(<3 x float> %{{.*}}) // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <3 x float> %elt.exp2, %{{.*}} // CHECK: ret <3 x float> %mul float3 test_ldexp_float3(float3 X, float3 Exp) { return ldexp(X, Exp); } -// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <4 x float> @_ZN4hlsl8__detail10ldexp_implIDv4_fEET_S3_S3_ +// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <4 x float> 
@_ZN4hlsl8__detail10ldexp_implIDv4_fEET_S3_S3_ // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32(<4 x float> %{{.*}}) // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <4 x float> %elt.exp2, %{{.*}} // CHECK: ret <4 x float> %mul diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl index 0b17d03d7097d..9297c35abfd16 100644 --- a/clang/test/CodeGenHLSL/builtins/length.hlsl +++ b/clang/test/CodeGenHLSL/builtins/length.hlsl @@ -8,16 +8,13 @@ // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK -// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh( -// - -// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z16test_length_halfDh( // CHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[P0]]) // CHECK-NEXT: ret half [[ELT_ABS_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh( // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[P0]]) @@ -28,18 +25,14 @@ half test_length_half(half p0) return length(p0); } -// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( -// - - -// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( // CHECK-SAME: <2 x 
half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[P0]], <2 x half> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> nofpclass(nan inf) [[P0]]) @@ -50,15 +43,14 @@ half test_length_half2(half2 p0) return length(p0); } -// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( -// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[P0]], <3 x half> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) 
[[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> nofpclass(nan inf) [[P0]]) @@ -69,15 +61,14 @@ half test_length_half3(half3 p0) return length(p0); } -// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( -// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[P0]], <4 x half> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> nofpclass(nan inf) [[P0]]) @@ -88,14 +79,13 @@ half test_length_half4(half4 p0) return length(p0); } -// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf( -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z17test_length_floatf( // CHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: 
[[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[P0]]) // CHECK-NEXT: ret float [[ELT_ABS_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf( // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[P0]]) @@ -106,15 +96,14 @@ float test_length_float(float p0) return length(p0); } -// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[P0]], <2 x float> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> nofpclass(nan inf) [[P0]]) @@ -125,15 +114,14 @@ float test_length_float2(float2 p0) 
return length(p0); } -// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[P0]], <3 x float> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> nofpclass(nan inf) [[P0]]) @@ -144,15 +132,14 @@ float test_length_float3(float3 p0) return length(p0); } -// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[P0]], <4 x float> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float 
@llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> nofpclass(nan inf) [[P0]]) diff --git a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl index 3cb14f8555cab..3b13e43873c77 100644 --- a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx -// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx -// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv -// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s 
--check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // CHECK: define [[FNATTRS]] float @_Z16test_lerp_doubled( // CHECK: [[CONV0:%.*]] = fptrunc {{.*}} double %{{.*}} to float diff --git a/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl index 5c63d630c3f3c..d7aacdc486ac6 100644 --- a/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl @@ -2,67 +2,67 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_double // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32( float test_log_double(double p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32 float2 test_log_double2(double2 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> 
{{.*}}test_log_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32 float3 test_log_double3(double3 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_double4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32 float4 test_log_double4(double4 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_int // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32( float test_log_int(int p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32 float2 test_log_int2(int2 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32 float3 test_log_int3(int3 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32 float4 test_log_int4(int4 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_uint // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32( float test_log_uint(uint p0) { return log(p0); } -// 
CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32 float2 test_log_uint2(uint2 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32 float3 test_log_uint3(uint3 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32 float4 test_log_uint4(uint4 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_int64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32( float test_log_int64_t(int64_t p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32 float2 test_log_int64_t2(int64_t2 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32 float3 test_log_int64_t3(int64_t3 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int64_t4 // 
CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32 float4 test_log_int64_t4(int64_t4 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_uint64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32( float test_log_uint64_t(uint64_t p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32 float2 test_log_uint64_t2(uint64_t2 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32 float3 test_log_uint64_t3(uint64_t3 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32 float4 test_log_uint64_t4(uint64_t4 p0) { return log(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl index e489939594a53..0136c1a052ed4 100644 --- a/clang/test/CodeGenHLSL/builtins/log.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log.hlsl @@ -5,36 +5,36 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_log_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_log_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.log.f16( -// NO_HALF-LABEL: define noundef 
nofpclass(nan inf) float @_Z13test_log_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_log_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32( half test_log_half(half p0) { return log(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_log_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_log_half2 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.log.v2f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_log_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_log_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32( half2 test_log_half2(half2 p0) { return log(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_log_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_log_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.log.v3f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_log_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_log_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32( half3 test_log_half3(half3 p0) { return log(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_log_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_log_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.log.v4f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_log_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_log_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32( half4 test_log_half4(half4 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) 
float @_Z14test_log_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_log_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32( float test_log_float(float p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_log_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_log_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32 float2 test_log_float2(float2 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_log_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_log_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32 float3 test_log_float3(float3 p0) { return log(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_log_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_log_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32 float4 test_log_float4(float4 p0) { return log(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl index 1a0539c3517d5..e408f4a5d45ce 100644 --- a/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl @@ -2,67 +2,67 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_double // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32( float test_log10_double(double p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> 
{{.*}}test_log10_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32 float2 test_log10_double2(double2 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32 float3 test_log10_double3(double3 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_double4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32 float4 test_log10_double4(double4 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_int // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32( float test_log10_int(int p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32 float2 test_log10_int2(int2 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32 float3 test_log10_int3(int3 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32 float4 test_log10_int4(int4 p0) { return 
log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_uint // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32( float test_log10_uint(uint p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32 float2 test_log10_uint2(uint2 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32 float3 test_log10_uint3(uint3 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32 float4 test_log10_uint4(uint4 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_int64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32( float test_log10_int64_t(int64_t p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32 float2 test_log10_int64_t2(int64_t2 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan 
inf) <3 x float> {{.*}}test_log10_int64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32 float3 test_log10_int64_t3(int64_t3 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32 float4 test_log10_int64_t4(int64_t4 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_uint64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32( float test_log10_uint64_t(uint64_t p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32 float2 test_log10_uint64_t2(uint64_t2 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32 float3 test_log10_uint64_t3(uint64_t3 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32 float4 test_log10_uint64_t4(uint64_t4 p0) { return log10(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/log10.hlsl b/clang/test/CodeGenHLSL/builtins/log10.hlsl index 37c8e837c45a3..6a75444143b18 100644 --- a/clang/test/CodeGenHLSL/builtins/log10.hlsl +++ 
b/clang/test/CodeGenHLSL/builtins/log10.hlsl @@ -5,36 +5,36 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_log10_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_log10_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.log10.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_log10_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_log10_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32( half test_log10_half(half p0) { return log10(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_log10_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_log10_half2 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.log10.v2f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_log10_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_log10_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32( half2 test_log10_half2(half2 p0) { return log10(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_log10_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_log10_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.log10.v3f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_log10_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_log10_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32( half3 test_log10_half3(half3 p0) { return log10(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_log10_half4 +// NATIVE_HALF-LABEL: define hidden 
noundef nofpclass(nan inf) <4 x half> @_Z16test_log10_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.log10.v4f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_log10_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_log10_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32( half4 test_log10_half4(half4 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_log10_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_log10_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32( float test_log10_float(float p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_log10_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_log10_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32 float2 test_log10_float2(float2 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_log10_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_log10_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32 float3 test_log10_float3(float3 p0) { return log10(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_log10_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_log10_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32 float4 test_log10_float4(float4 p0) { return log10(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl index c35b50d8e490a..f88d5ab849212 100644 --- a/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl @@ -2,67 +2,67 @@ // RUN: -emit-llvm 
-disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_double // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32( float test_log2_double(double p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32 float2 test_log2_double2(double2 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32 float3 test_log2_double3(double3 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_double4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32 float4 test_log2_double4(double4 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_int // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32( float test_log2_int(int p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32 float2 test_log2_int2(int2 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int3 +// CHECK-LABEL: define 
hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32 float3 test_log2_int3(int3 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32 float4 test_log2_int4(int4 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_uint // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32( float test_log2_uint(uint p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32 float2 test_log2_uint2(uint2 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32 float3 test_log2_uint3(uint3 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32 float4 test_log2_uint4(uint4 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_int64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32( float test_log2_int64_t(int64_t p0) { return log2(p0); } -// 
CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32 float2 test_log2_int64_t2(int64_t2 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32 float3 test_log2_int64_t3(int64_t3 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32 float4 test_log2_int64_t4(int64_t4 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_uint64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32( float test_log2_uint64_t(uint64_t p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32 float2 test_log2_uint64_t2(uint64_t2 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32 float3 test_log2_uint64_t3(uint64_t3 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint64_t4 +// CHECK-LABEL: 
define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32 float4 test_log2_uint64_t4(uint64_t4 p0) { return log2(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl index 5159d5bb0fa4e..84d73c1810890 100644 --- a/clang/test/CodeGenHLSL/builtins/log2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl @@ -5,36 +5,36 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_log2_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_log2_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.log2.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_log2_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_log2_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32( half test_log2_half(half p0) { return log2(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_log2_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_log2_half2 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.log2.v2f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_log2_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_log2_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32( half2 test_log2_half2(half2 p0) { return log2(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_log2_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_log2_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.log2.v3f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x 
float> @_Z15test_log2_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_log2_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32( half3 test_log2_half3(half3 p0) { return log2(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_log2_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_log2_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.log2.v4f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_log2_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_log2_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32( half4 test_log2_half4(half4 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_log2_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_log2_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32( float test_log2_float(float p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_log2_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_log2_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32 float2 test_log2_float2(float2 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_log2_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_log2_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32 float3 test_log2_float3(float3 p0) { return log2(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_log2_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_log2_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32 float4 
test_log2_float4(float4 p0) { return log2(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl index d952398a6a592..cd7013ba75825 100644 --- a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl @@ -4,14 +4,14 @@ // RUN: -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_max_short4_mismatch +// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_max_short4_mismatch // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer // NATIVE_HALF: [[MAX:%.*]] = call noundef <4 x i16> @llvm.smax.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]]) // NATIVE_HALF: ret <4 x i16> [[MAX]] int16_t4 test_max_short4_mismatch(int16_t4 p0, int16_t p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_max_ushort4_mismatch +// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_max_ushort4_mismatch // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer // NATIVE_HALF: [[MAX:%.*]] = call noundef <4 x i16> @llvm.umax.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]]) @@ -19,61 +19,61 @@ int16_t4 test_max_short4_mismatch(int16_t4 p0, int16_t p1) { return max(p0, p1); uint16_t4 test_max_ushort4_mismatch(uint16_t4 p0, uint16_t p1) { return max(p0, p1); } #endif -// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_max_int4_mismatch +// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_max_int4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, 
<4 x i32> zeroinitializer // CHECK: [[MAX:%.*]] = call noundef <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]]) // CHECK: ret <4 x i32> [[MAX]] int4 test_max_int4_mismatch(int4 p0, int p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_max_uint4_mismatch +// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_max_uint4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK: [[MAX:%.*]] = call noundef <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]]) // CHECK: ret <4 x i32> [[MAX]] uint4 test_max_uint4_mismatch(uint4 p0, uint p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_max_long4_mismatch +// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_max_long4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer // CHECK: [[MAX:%.*]] = call noundef <4 x i64> @llvm.smax.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]]) // CHECK: ret <4 x i64> [[MAX]] int64_t4 test_max_long4_mismatch(int64_t4 p0, int64_t p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_max_ulong4_mismatch +// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_max_ulong4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer // CHECK: [[MAX:%.*]] = call noundef <4 x i64> @llvm.umax.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]]) // CHECK: ret <4 x i64> [[MAX]] uint64_t4 test_max_ulong4_mismatch(uint64_t4 p0, uint64_t p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> {{.*}}test_max_half4_mismatch +// 
NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> {{.*}}test_max_half4_mismatch // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x half> poison, half %{{.*}}, i64 0 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x half> [[CONV0]], <4 x half> poison, <4 x i32> zeroinitializer // NATIVE_HALF: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.maxnum.v4f16(<4 x half> %{{.*}}, <4 x half> [[CONV1]]) // NATIVE_HALF: ret <4 x half> [[MAX]] -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_half4_mismatch +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_half4_mismatch // NO_HALF: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0 // NO_HALF: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer // NO_HALF: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.maxnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]]) // NO_HALF: ret <4 x float> [[MAX]] half4 test_max_half4_mismatch(half4 p0, half p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_float4_mismatch +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_float4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer // CHECK: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.maxnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]]) // CHECK: ret <4 x float> [[MAX]] float4 test_max_float4_mismatch(float4 p0, float p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch // CHECK: [[CONV0:%.*]] = 
insertelement <4 x double> poison, double %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer // CHECK: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.maxnum.v4f64(<4 x double> %{{.*}}, <4 x double> [[CONV1]]) // CHECK: ret <4 x double> [[MAX]] double4 test_max_double4_mismatch(double4 p0, double p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch2 // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer // CHECK: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.maxnum.v4f64(<4 x double> [[CONV1]], <4 x double> %{{.*}}) diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl index 0b767335556ee..fab53a160c856 100644 --- a/clang/test/CodeGenHLSL/builtins/max.hlsl +++ b/clang/test/CodeGenHLSL/builtins/max.hlsl @@ -6,128 +6,128 @@ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF-LABEL: define noundef i16 @_Z14test_max_short +// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z14test_max_short // NATIVE_HALF: call i16 @llvm.smax.i16( int16_t test_max_short(int16_t p0, int16_t p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z15test_max_short2 +// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z15test_max_short2 // NATIVE_HALF: call <2 x i16> @llvm.smax.v2i16( int16_t2 test_max_short2(int16_t2 p0, int16_t2 p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z15test_max_short3 +// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z15test_max_short3 // NATIVE_HALF: call <3 x i16> 
@llvm.smax.v3i16 int16_t3 test_max_short3(int16_t3 p0, int16_t3 p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z15test_max_short4 +// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z15test_max_short4 // NATIVE_HALF: call <4 x i16> @llvm.smax.v4i16 int16_t4 test_max_short4(int16_t4 p0, int16_t4 p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef i16 @_Z15test_max_ushort +// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z15test_max_ushort // NATIVE_HALF: call i16 @llvm.umax.i16( uint16_t test_max_ushort(uint16_t p0, uint16_t p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z16test_max_ushort2 +// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z16test_max_ushort2 // NATIVE_HALF: call <2 x i16> @llvm.umax.v2i16 uint16_t2 test_max_ushort2(uint16_t2 p0, uint16_t2 p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z16test_max_ushort3 +// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z16test_max_ushort3 // NATIVE_HALF: call <3 x i16> @llvm.umax.v3i16 uint16_t3 test_max_ushort3(uint16_t3 p0, uint16_t3 p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z16test_max_ushort4 +// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z16test_max_ushort4 // NATIVE_HALF: call <4 x i16> @llvm.umax.v4i16 uint16_t4 test_max_ushort4(uint16_t4 p0, uint16_t4 p1) { return max(p0, p1); } #endif -// CHECK-LABEL: define noundef i32 @_Z12test_max_int +// CHECK-LABEL: define hidden noundef i32 @_Z12test_max_int // CHECK: call i32 @llvm.smax.i32( int test_max_int(int p0, int p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <2 x i32> @_Z13test_max_int2 +// CHECK-LABEL: define hidden noundef <2 x i32> @_Z13test_max_int2 // CHECK: call <2 x i32> @llvm.smax.v2i32 int2 test_max_int2(int2 p0, int2 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <3 x i32> @_Z13test_max_int3 +// CHECK-LABEL: define hidden noundef <3 x i32> 
@_Z13test_max_int3 // CHECK: call <3 x i32> @llvm.smax.v3i32 int3 test_max_int3(int3 p0, int3 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <4 x i32> @_Z13test_max_int4 +// CHECK-LABEL: define hidden noundef <4 x i32> @_Z13test_max_int4 // CHECK: call <4 x i32> @llvm.smax.v4i32 int4 test_max_int4(int4 p0, int4 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef i32 @_Z13test_max_uint +// CHECK-LABEL: define hidden noundef i32 @_Z13test_max_uint // CHECK: call i32 @llvm.umax.i32( int test_max_uint(uint p0, uint p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <2 x i32> @_Z14test_max_uint2 +// CHECK-LABEL: define hidden noundef <2 x i32> @_Z14test_max_uint2 // CHECK: call <2 x i32> @llvm.umax.v2i32 uint2 test_max_uint2(uint2 p0, uint2 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <3 x i32> @_Z14test_max_uint3 +// CHECK-LABEL: define hidden noundef <3 x i32> @_Z14test_max_uint3 // CHECK: call <3 x i32> @llvm.umax.v3i32 uint3 test_max_uint3(uint3 p0, uint3 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <4 x i32> @_Z14test_max_uint4 +// CHECK-LABEL: define hidden noundef <4 x i32> @_Z14test_max_uint4 // CHECK: call <4 x i32> @llvm.umax.v4i32 uint4 test_max_uint4(uint4 p0, uint4 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef i64 @_Z13test_max_long +// CHECK-LABEL: define hidden noundef i64 @_Z13test_max_long // CHECK: call i64 @llvm.smax.i64( int64_t test_max_long(int64_t p0, int64_t p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <2 x i64> @_Z14test_max_long2 +// CHECK-LABEL: define hidden noundef <2 x i64> @_Z14test_max_long2 // CHECK: call <2 x i64> @llvm.smax.v2i64 int64_t2 test_max_long2(int64_t2 p0, int64_t2 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <3 x i64> @_Z14test_max_long3 +// CHECK-LABEL: define hidden noundef <3 x i64> @_Z14test_max_long3 // CHECK: call <3 x i64> @llvm.smax.v3i64 int64_t3 test_max_long3(int64_t3 p0, int64_t3 p1) { return max(p0, p1); } -// 
CHECK-LABEL: define noundef <4 x i64> @_Z14test_max_long4 +// CHECK-LABEL: define hidden noundef <4 x i64> @_Z14test_max_long4 // CHECK: call <4 x i64> @llvm.smax.v4i64 int64_t4 test_max_long4(int64_t4 p0, int64_t4 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef i64 @_Z14test_max_ulong +// CHECK-LABEL: define hidden noundef i64 @_Z14test_max_ulong // CHECK: call i64 @llvm.umax.i64( uint64_t test_max_ulong(uint64_t p0, uint64_t p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <2 x i64> @_Z15test_max_ulong2 +// CHECK-LABEL: define hidden noundef <2 x i64> @_Z15test_max_ulong2 // CHECK: call <2 x i64> @llvm.umax.v2i64 uint64_t2 test_max_ulong2(uint64_t2 p0, uint64_t2 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <3 x i64> @_Z15test_max_ulong3 +// CHECK-LABEL: define hidden noundef <3 x i64> @_Z15test_max_ulong3 // CHECK: call <3 x i64> @llvm.umax.v3i64 uint64_t3 test_max_ulong3(uint64_t3 p0, uint64_t3 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef <4 x i64> @_Z15test_max_ulong4 +// CHECK-LABEL: define hidden noundef <4 x i64> @_Z15test_max_ulong4 // CHECK: call <4 x i64> @llvm.umax.v4i64 uint64_t4 test_max_ulong4(uint64_t4 p0, uint64_t4 p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_max_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_max_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.maxnum.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_max_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_max_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.maxnum.f32( half test_max_half(half p0, half p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_max_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_max_half2 // NATIVE_HALF: call reassoc nnan ninf nsz 
arcp afn <2 x half> @llvm.maxnum.v2f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_max_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_max_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.maxnum.v2f32( half2 test_max_half2(half2 p0, half2 p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_max_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_max_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.maxnum.v3f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_max_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_max_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.maxnum.v3f32( half3 test_max_half3(half3 p0, half3 p1) { return max(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_max_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_max_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.maxnum.v4f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_max_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_max_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.maxnum.v4f32( half4 test_max_half4(half4 p0, half4 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_max_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_max_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.maxnum.f32( float test_max_float(float p0, float p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_max_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> 
@_Z15test_max_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.maxnum.v2f32 float2 test_max_float2(float2 p0, float2 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_max_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_max_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.maxnum.v3f32 float3 test_max_float3(float3 p0, float3 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_max_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_max_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.maxnum.v4f32 float4 test_max_float4(float4 p0, float4 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) double @_Z15test_max_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) double @_Z15test_max_double // CHECK: call reassoc nnan ninf nsz arcp afn double @llvm.maxnum.f64( double test_max_double(double p0, double p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x double> @_Z16test_max_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x double> @_Z16test_max_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.maxnum.v2f64 double2 test_max_double2(double2 p0, double2 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x double> @_Z16test_max_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x double> @_Z16test_max_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.maxnum.v3f64 double3 test_max_double3(double3 p0, double3 p1) { return max(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> @_Z16test_max_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> @_Z16test_max_double4 // CHECK: call reassoc nnan ninf nsz arcp afn 
<4 x double> @llvm.maxnum.v4f64 double4 test_max_double4(double4 p0, double4 p1) { return max(p0, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl index 5c200f488c246..f81fa128ce9c7 100644 --- a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl @@ -4,14 +4,14 @@ // RUN: -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_min_short4_mismatch +// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_min_short4_mismatch // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer // NATIVE_HALF: [[MIN:%.*]] = call noundef <4 x i16> @llvm.smin.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]]) // NATIVE_HALF: ret <4 x i16> [[MIN]] int16_t4 test_min_short4_mismatch(int16_t4 p0, int16_t p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_min_ushort4_mismatch +// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_min_ushort4_mismatch // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer // NATIVE_HALF: [[MIN:%.*]] = call noundef <4 x i16> @llvm.umin.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]]) @@ -19,61 +19,61 @@ int16_t4 test_min_short4_mismatch(int16_t4 p0, int16_t p1) { return min(p0, p1); uint16_t4 test_min_ushort4_mismatch(uint16_t4 p0, uint16_t p1) { return min(p0, p1); } #endif -// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_min_int4_mismatch +// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_min_int4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] 
= shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK: [[MIN:%.*]] = call noundef <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]]) // CHECK: ret <4 x i32> [[MIN]] int4 test_min_int4_mismatch(int4 p0, int p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_min_uint4_mismatch +// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_min_uint4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK: [[MIN:%.*]] = call noundef <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]]) // CHECK: ret <4 x i32> [[MIN]] uint4 test_min_uint4_mismatch(uint4 p0, uint p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_min_long4_mismatch +// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_min_long4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer // CHECK: [[MIN:%.*]] = call noundef <4 x i64> @llvm.smin.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]]) // CHECK: ret <4 x i64> [[MIN]] int64_t4 test_min_long4_mismatch(int64_t4 p0, int64_t p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_min_ulong4_mismatch +// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_min_ulong4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer // CHECK: [[MIN:%.*]] = call noundef <4 x i64> @llvm.umin.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]]) // CHECK: ret <4 x i64> [[MIN]] uint64_t4 test_min_ulong4_mismatch(uint64_t4 p0, uint64_t p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x 
half> {{.*}}test_min_half4_mismatch +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> {{.*}}test_min_half4_mismatch // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x half> poison, half %{{.*}}, i64 0 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x half> [[CONV0]], <4 x half> poison, <4 x i32> zeroinitializer // NATIVE_HALF: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.minnum.v4f16(<4 x half> %{{.*}}, <4 x half> [[CONV1]]) // NATIVE_HALF: ret <4 x half> [[MIN]] -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_half4_mismatch +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_half4_mismatch // NO_HALF: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0 // NO_HALF: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer // NO_HALF: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.minnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]]) // NO_HALF: ret <4 x float> [[MIN]] half4 test_min_half4_mismatch(half4 p0, half p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_float4_mismatch +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_float4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer // CHECK: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.minnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]]) // CHECK: ret <4 x float> [[MIN]] float4 test_min_float4_mismatch(float4 p0, float p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> 
{{.*}}test_min_double4_mismatch // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer // CHECK: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.minnum.v4f64(<4 x double> %{{.*}}, <4 x double> [[CONV1]]) // CHECK: ret <4 x double> [[MIN]] double4 test_min_double4_mismatch(double4 p0, double p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch2 // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer // CHECK: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.minnum.v4f64(<4 x double> [[CONV1]], <4 x double> %{{.*}}) diff --git a/clang/test/CodeGenHLSL/builtins/min.hlsl b/clang/test/CodeGenHLSL/builtins/min.hlsl index 508d8b68ea452..b3e8fedff9b1b 100644 --- a/clang/test/CodeGenHLSL/builtins/min.hlsl +++ b/clang/test/CodeGenHLSL/builtins/min.hlsl @@ -6,131 +6,131 @@ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF-LABEL: define noundef i16 @_Z14test_min_short +// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z14test_min_short // NATIVE_HALF: call i16 @llvm.smin.i16( int16_t test_min_short(int16_t p0, int16_t p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z15test_min_short2 +// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z15test_min_short2 // NATIVE_HALF: call <2 x i16> @llvm.smin.v2i16( int16_t2 test_min_short2(int16_t2 p0, int16_t2 p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z15test_min_short3 +// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> 
@_Z15test_min_short3 // NATIVE_HALF: call <3 x i16> @llvm.smin.v3i16 int16_t3 test_min_short3(int16_t3 p0, int16_t3 p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z15test_min_short4 +// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z15test_min_short4 // NATIVE_HALF: call <4 x i16> @llvm.smin.v4i16 int16_t4 test_min_short4(int16_t4 p0, int16_t4 p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef i16 @_Z15test_min_ushort +// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z15test_min_ushort // NATIVE_HALF: call i16 @llvm.umin.i16( uint16_t test_min_ushort(uint16_t p0, uint16_t p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z16test_min_ushort2 +// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z16test_min_ushort2 // NATIVE_HALF: call <2 x i16> @llvm.umin.v2i16 uint16_t2 test_min_ushort2(uint16_t2 p0, uint16_t2 p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z16test_min_ushort3 +// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z16test_min_ushort3 // NATIVE_HALF: call <3 x i16> @llvm.umin.v3i16 uint16_t3 test_min_ushort3(uint16_t3 p0, uint16_t3 p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z16test_min_ushort4 +// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z16test_min_ushort4 // NATIVE_HALF: call <4 x i16> @llvm.umin.v4i16 uint16_t4 test_min_ushort4(uint16_t4 p0, uint16_t4 p1) { return min(p0, p1); } #endif -// CHECK-LABEL: define noundef i32 @_Z12test_min_int +// CHECK-LABEL: define hidden noundef i32 @_Z12test_min_int // CHECK: call i32 @llvm.smin.i32( int test_min_int(int p0, int p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <2 x i32> @_Z13test_min_int2 +// CHECK-LABEL: define hidden noundef <2 x i32> @_Z13test_min_int2 // CHECK: call <2 x i32> @llvm.smin.v2i32 int2 test_min_int2(int2 p0, int2 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <3 x i32> @_Z13test_min_int3 +// 
CHECK-LABEL: define hidden noundef <3 x i32> @_Z13test_min_int3 // CHECK: call <3 x i32> @llvm.smin.v3i32 int3 test_min_int3(int3 p0, int3 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <4 x i32> @_Z13test_min_int4 +// CHECK-LABEL: define hidden noundef <4 x i32> @_Z13test_min_int4 // CHECK: call <4 x i32> @llvm.smin.v4i32 int4 test_min_int4(int4 p0, int4 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef i32 @_Z13test_min_uint +// CHECK-LABEL: define hidden noundef i32 @_Z13test_min_uint // CHECK: call i32 @llvm.umin.i32( int test_min_uint(uint p0, uint p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <2 x i32> @_Z14test_min_uint2 +// CHECK-LABEL: define hidden noundef <2 x i32> @_Z14test_min_uint2 // CHECK: call <2 x i32> @llvm.umin.v2i32 uint2 test_min_uint2(uint2 p0, uint2 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <3 x i32> @_Z14test_min_uint3 +// CHECK-LABEL: define hidden noundef <3 x i32> @_Z14test_min_uint3 // CHECK: call <3 x i32> @llvm.umin.v3i32 uint3 test_min_uint3(uint3 p0, uint3 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <4 x i32> @_Z14test_min_uint4 +// CHECK-LABEL: define hidden noundef <4 x i32> @_Z14test_min_uint4 // CHECK: call <4 x i32> @llvm.umin.v4i32 uint4 test_min_uint4(uint4 p0, uint4 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef i64 @_Z13test_min_long +// CHECK-LABEL: define hidden noundef i64 @_Z13test_min_long // CHECK: call i64 @llvm.smin.i64( int64_t test_min_long(int64_t p0, int64_t p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <2 x i64> @_Z14test_min_long2 +// CHECK-LABEL: define hidden noundef <2 x i64> @_Z14test_min_long2 // CHECK: call <2 x i64> @llvm.smin.v2i64 int64_t2 test_min_long2(int64_t2 p0, int64_t2 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <3 x i64> @_Z14test_min_long3 +// CHECK-LABEL: define hidden noundef <3 x i64> @_Z14test_min_long3 // CHECK: call <3 x i64> @llvm.smin.v3i64 int64_t3 test_min_long3(int64_t3 
p0, int64_t3 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <4 x i64> @_Z14test_min_long4 +// CHECK-LABEL: define hidden noundef <4 x i64> @_Z14test_min_long4 // CHECK: call <4 x i64> @llvm.smin.v4i64 int64_t4 test_min_long4(int64_t4 p0, int64_t4 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef i64 @_Z14test_min_ulong +// CHECK-LABEL: define hidden noundef i64 @_Z14test_min_ulong // CHECK: call i64 @llvm.umin.i64( uint64_t test_min_ulong(uint64_t p0, uint64_t p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <2 x i64> @_Z15test_min_ulong2 +// CHECK-LABEL: define hidden noundef <2 x i64> @_Z15test_min_ulong2 // CHECK: call <2 x i64> @llvm.umin.v2i64 uint64_t2 test_min_ulong2(uint64_t2 p0, uint64_t2 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <3 x i64> @_Z15test_min_ulong3 +// CHECK-LABEL: define hidden noundef <3 x i64> @_Z15test_min_ulong3 // CHECK: call <3 x i64> @llvm.umin.v3i64 uint64_t3 test_min_ulong3(uint64_t3 p0, uint64_t3 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef <4 x i64> @_Z15test_min_ulong4 +// CHECK-LABEL: define hidden noundef <4 x i64> @_Z15test_min_ulong4 // CHECK: call <4 x i64> @llvm.umin.v4i64 uint64_t4 test_min_ulong4(uint64_t4 p0, uint64_t4 p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_min_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_min_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.minnum.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_min_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_min_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.minnum.f32( half test_min_half(half p0, half p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_min_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_min_half2 
// NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.minnum.v2f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_min_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_min_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.minnum.v2f32( half2 test_min_half2(half2 p0, half2 p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_min_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_min_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.minnum.v3f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_min_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_min_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.minnum.v3f32( half3 test_min_half3(half3 p0, half3 p1) { return min(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_min_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_min_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.minnum.v4f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_min_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_min_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.minnum.v4f32( half4 test_min_half4(half4 p0, half4 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_min_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_min_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.minnum.f32( float test_min_float(float p0, float p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_min_float2 +// CHECK-LABEL: define hidden noundef 
nofpclass(nan inf) <2 x float> @_Z15test_min_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.minnum.v2f32 float2 test_min_float2(float2 p0, float2 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_min_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_min_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.minnum.v3f32 float3 test_min_float3(float3 p0, float3 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_min_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_min_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.minnum.v4f32 float4 test_min_float4(float4 p0, float4 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) double @_Z15test_min_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) double @_Z15test_min_double // CHECK: call reassoc nnan ninf nsz arcp afn double @llvm.minnum.f64( double test_min_double(double p0, double p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x double> @_Z16test_min_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x double> @_Z16test_min_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.minnum.v2f64 double2 test_min_double2(double2 p0, double2 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x double> @_Z16test_min_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x double> @_Z16test_min_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.minnum.v3f64 double3 test_min_double3(double3 p0, double3 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> @_Z16test_min_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> @_Z16test_min_double4 // CHECK: call 
reassoc nnan ninf nsz arcp afn <4 x double> @llvm.minnum.v4f64 double4 test_min_double4(double4 p0, double4 p1) { return min(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch // CHECK: call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.minnum.v4f64 double4 test_min_double4_mismatch(double4 p0, double p1) { return min(p0, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl index e9baa25fc6409..52ff7da94c4f7 100644 --- a/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl @@ -1,11 +1,11 @@ // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // CHECK: define [[FNATTRS]] float @ // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].normalize.f32(float diff --git a/clang/test/CodeGenHLSL/builtins/normalize.hlsl b/clang/test/CodeGenHLSL/builtins/normalize.hlsl index 830fc26b7acf0..cc2378756a50a 100644 --- a/clang/test/CodeGenHLSL/builtins/normalize.hlsl +++ b/clang/test/CodeGenHLSL/builtins/normalize.hlsl @@ -2,20 +2,20 @@ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o 
- | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // NATIVE_HALF: define [[FNATTRS]] half @ // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].normalize.f16(half diff --git a/clang/test/CodeGenHLSL/builtins/or.hlsl b/clang/test/CodeGenHLSL/builtins/or.hlsl index 69c57c5455f7d..66cc5572a75b5 100644 --- a/clang/test/CodeGenHLSL/builtins/or.hlsl +++ b/clang/test/CodeGenHLSL/builtins/or.hlsl @@ -2,7 +2,7 @@ // RUN: dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s -//CHECK-LABEL: define noundef i1 @_Z14test_or_scalarbb( +//CHECK-LABEL: define hidden noundef i1 @_Z14test_or_scalarbb( //CHECK-SAME: i1 noundef [[X:%.*]], i1 noundef [[Y:%.*]]) #[[ATTR0:[0-9]+]] { //CHECK-NEXT: entry: //CHECK: [[HLSL_OR:%.*]] = or i1 [[A:%.*]], [[B:%.*]] 
@@ -12,7 +12,7 @@ bool test_or_scalar(bool x, bool y) return or(x, y); } -//CHECK-LABEL: define noundef <2 x i1> @_Z13test_or_bool2Dv2_bS_( +//CHECK-LABEL: define hidden noundef <2 x i1> @_Z13test_or_bool2Dv2_bS_( //CHECK-SAME: <2 x i1> noundef [[X:%.*]], <2 x i1> noundef [[Y:%.*]]) #[[ATTR0]] { //CHECK-NEXT: entry: //CHECK: [[HLSL_OR:%.*]] = or <2 x i1> [[A:%.*]], [[B:%.*]] @@ -22,7 +22,7 @@ bool2 test_or_bool2(bool2 x, bool2 y) return or(x, y); } -//CHECK-LABEL: define noundef <3 x i1> @_Z13test_or_bool3Dv3_bS_( +//CHECK-LABEL: define hidden noundef <3 x i1> @_Z13test_or_bool3Dv3_bS_( //CHECK-SAME: <3 x i1> noundef [[X:%.*]], <3 x i1> noundef [[Y:%.*]]) #[[ATTR0]] { //CHECK-NEXT: entry: //CHECK: [[HLSL_OR:%.*]] = or <3 x i1> [[A:%.*]], [[B:%.*]] @@ -32,7 +32,7 @@ bool3 test_or_bool3(bool3 x, bool3 y) return or(x, y); } -//CHECK-LABEL: define noundef <4 x i1> @_Z13test_or_bool4Dv4_bS_( +//CHECK-LABEL: define hidden noundef <4 x i1> @_Z13test_or_bool4Dv4_bS_( //CHECK-SAME: <4 x i1> noundef [[X:%.*]], <4 x i1> noundef [[Y:%.*]]) #[[ATTR0]] { //CHECK-NEXT: entry: //CHECK: [[HLSL_OR:%.*]] = or <4 x i1> [[A:%.*]], [[B:%.*]] @@ -42,7 +42,7 @@ bool4 test_or_bool4(bool4 x, bool4 y) return or(x, y); } -//CHECK-LABEL: define noundef i1 @_Z11test_or_intii( +//CHECK-LABEL: define hidden noundef i1 @_Z11test_or_intii( //CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0]] { //CHECK-NEXT: entry: //CHECK: [[TOBBOL:%.*]] = icmp ne i32 [[A:%.*]], 0 @@ -54,7 +54,7 @@ bool test_or_int(int x, int y) return or(x, y); } -//CHECK-LABEL: define noundef <4 x i1> @_Z12test_or_int4Dv4_iS_( +//CHECK-LABEL: define hidden noundef <4 x i1> @_Z12test_or_int4Dv4_iS_( //CHECK-SAME: <4 x i32> noundef [[X:%.*]], <4 x i32> noundef [[Y:%.*]]) #[[ATTR0]] { //CHECK-NEXT: entry: //CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[A:%.*]], zeroinitializer @@ -66,7 +66,7 @@ bool4 test_or_int4(int4 x, int4 y) return or(x, y); } -//CHECK-LABEL: define noundef <4 x i1> @_Z14test_or_float4Dv4_fS_( 
+//CHECK-LABEL: define hidden noundef <4 x i1> @_Z14test_or_float4Dv4_fS_( //CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR0]] { //CHECK-NEXT: entry: //CHECK: [[TOBOOL:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une <4 x float> [[A:%.*]], zeroinitializer diff --git a/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl index 39003aef7b7b5..0d1f3d3546a33 100644 --- a/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl @@ -2,125 +2,125 @@ // RUN: -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK \ // RUN: -DFLOATATTRS="reassoc nnan ninf nsz arcp afn" -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_double // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] double %{{.*}} to float // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] double %{{.*}} to float // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]]) // CHECK: ret float [[POW]] float test_pow_double(double p0, double p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_double2 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] <2 x double> %{{.*}} to <2 x float> // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] <2 x double> %{{.*}} to <2 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]]) // CHECK: ret <2 x float> [[POW]] float2 test_pow_double2(double2 p0, double2 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> 
{{.*}}test_pow_double3 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] <3 x double> %{{.*}} to <3 x float> // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] <3 x double> %{{.*}} to <3 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]]) // CHECK: ret <3 x float> [[POW]] float3 test_pow_double3(double3 p0, double3 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_double4 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] <4 x double> %{{.*}} to <4 x float> // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] <4 x double> %{{.*}} to <4 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]]) // CHECK: ret <4 x float> [[POW]] float4 test_pow_double4(double4 p0, double4 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_int // CHECK: [[CONV0:%.*]] = sitofp i32 %{{.*}} to float // CHECK: [[CONV1:%.*]] = sitofp i32 %{{.*}} to float // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]]) // CHECK: ret float [[POW]] float test_pow_int(int p0, int p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int2 // CHECK: [[CONV0:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float> // CHECK: [[CONV1:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]]) // CHECK: ret <2 x float> [[POW]] float2 test_pow_int2(int2 p0, int2 p1) { return pow(p0, p1); } -// 
CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int3 // CHECK: [[CONV0:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float> // CHECK: [[CONV1:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]]) // CHECK: ret <3 x float> [[POW]] float3 test_pow_int3(int3 p0, int3 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int4 // CHECK: [[CONV0:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float> // CHECK: [[CONV1:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]]) // CHECK: ret <4 x float> [[POW]] float4 test_pow_int4(int4 p0, int4 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_uint // CHECK: [[CONV0:%.*]] = uitofp i32 %{{.*}} to float // CHECK: [[CONV1:%.*]] = uitofp i32 %{{.*}} to float // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]]) // CHECK: ret float [[POW]] float test_pow_uint(uint p0, uint p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint2 // CHECK: [[CONV0:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float> // CHECK: [[CONV1:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]]) // CHECK: ret <2 x float> [[POW]] float2 
test_pow_uint2(uint2 p0, uint2 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint3 // CHECK: [[CONV0:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float> // CHECK: [[CONV1:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]]) // CHECK: ret <3 x float> [[POW]] float3 test_pow_uint3(uint3 p0, uint3 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint4 // CHECK: [[CONV0:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float> // CHECK: [[CONV1:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]]) // CHECK: ret <4 x float> [[POW]] float4 test_pow_uint4(uint4 p0, uint4 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_int64_t // CHECK: [[CONV0:%.*]] = sitofp i64 %{{.*}} to float // CHECK: [[CONV1:%.*]] = sitofp i64 %{{.*}} to float // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]]) // CHECK: ret float [[POW]] float test_pow_int64_t(int64_t p0, int64_t p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int64_t2 // CHECK: [[CONV0:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float> // CHECK: [[CONV1:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> 
@llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]]) // CHECK: ret <2 x float> [[POW]] float2 test_pow_int64_t2(int64_t2 p0, int64_t2 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int64_t3 // CHECK: [[CONV0:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float> // CHECK: [[CONV1:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]]) // CHECK: ret <3 x float> [[POW]] float3 test_pow_int64_t3(int64_t3 p0, int64_t3 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int64_t4 // CHECK: [[CONV0:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float> // CHECK: [[CONV1:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]]) // CHECK: ret <4 x float> [[POW]] float4 test_pow_int64_t4(int64_t4 p0, int64_t4 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_uint64_t // CHECK: [[CONV0:%.*]] = uitofp i64 %{{.*}} to float // CHECK: [[CONV1:%.*]] = uitofp i64 %{{.*}} to float // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]]) // CHECK: ret float [[POW]] float test_pow_uint64_t(uint64_t p0, uint64_t p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint64_t2 // CHECK: [[CONV0:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x 
float> // CHECK: [[CONV1:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]]) // CHECK: ret <2 x float> [[POW]] float2 test_pow_uint64_t2(uint64_t2 p0, uint64_t2 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint64_t3 // CHECK: [[CONV0:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float> // CHECK: [[CONV1:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]]) // CHECK: ret <3 x float> [[POW]] float3 test_pow_uint64_t3(uint64_t3 p0, uint64_t3 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint64_t4 // CHECK: [[CONV0:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float> // CHECK: [[CONV1:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float> // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]]) diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl index fd21f1b94c57e..fcde755e15fcc 100644 --- a/clang/test/CodeGenHLSL/builtins/pow.hlsl +++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl @@ -5,36 +5,36 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_pow_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_pow_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.pow.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_pow_half +// NO_HALF-LABEL: 
define hidden noundef nofpclass(nan inf) float @_Z13test_pow_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.pow.f32( half test_pow_half(half p0, half p1) { return pow(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_pow_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_pow_half2 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.pow.v2f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_pow_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_pow_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.pow.v2f32( half2 test_pow_half2(half2 p0, half2 p1) { return pow(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_pow_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_pow_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.pow.v3f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_pow_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_pow_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.pow.v3f32( half3 test_pow_half3(half3 p0, half3 p1) { return pow(p0, p1); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_pow_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_pow_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.pow.v4f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_pow_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_pow_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.pow.v4f32( half4 test_pow_half4(half4 p0, half4 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float 
@_Z14test_pow_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_pow_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.pow.f32( float test_pow_float(float p0, float p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_pow_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_pow_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.pow.v2f32 float2 test_pow_float2(float2 p0, float2 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_pow_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_pow_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.pow.v3f32 float3 test_pow_float3(float3 p0, float3 p1) { return pow(p0, p1); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_pow_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_pow_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.pow.v4f32 float4 test_pow_float4(float4 p0, float4 p1) { return pow(p0, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl index d0cfc7b60265b..4b12f590edcd6 100644 --- a/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl @@ -1,11 +1,11 @@ // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DTARGET=dx -DFNATTRS="noundef nofpclass(nan inf)" +// RUN: -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)" // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | 
FileCheck %s --check-prefixes=CHECK \ -// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef nofpclass(nan inf)" +// RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" // CHECK: define [[FNATTRS]] float @ // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].radians.f32( diff --git a/clang/test/CodeGenHLSL/builtins/radians.hlsl b/clang/test/CodeGenHLSL/builtins/radians.hlsl index efdeb9f6e142a..f281747fbf298 100644 --- a/clang/test/CodeGenHLSL/builtins/radians.hlsl +++ b/clang/test/CodeGenHLSL/builtins/radians.hlsl @@ -2,20 +2,20 @@ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DTARGET=dx -DFNATTRS="noundef nofpclass(nan inf)" +// RUN: -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)" // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DTARGET=dx -DFNATTRS="noundef nofpclass(nan inf)" +// RUN: -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)" // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef nofpclass(nan inf)" +// RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef nofpclass(nan inf)" +// RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" // NATIVE_HALF: define [[FNATTRS]] half @ diff --git 
a/clang/test/CodeGenHLSL/builtins/rcp.hlsl b/clang/test/CodeGenHLSL/builtins/rcp.hlsl index 8f07f3a031531..cdfaa3c5f1ee3 100644 --- a/clang/test/CodeGenHLSL/builtins/rcp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/rcp.hlsl @@ -13,90 +13,90 @@ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF,SPIR_NO_HALF,SPIR_CHECK -// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) half @ -// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) half @ +// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) half @ +// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) half @ // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn half 0xH3C00, %{{.*}} // NATIVE_HALF: ret half %hlsl.rcp -// DXIL_NO_HALF: define noundef nofpclass(nan inf) float @ -// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) float @ +// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) float @ +// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) float @ // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn float 1.000000e+00, %{{.*}} // NO_HALF: ret float %hlsl.rcp half test_rcp_half(half p0) { return rcp(p0); } -// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) <2 x half> @ -// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) <2 x half> @ +// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) <2 x half> @ +// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @ // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x half> splat (half 0xH3C00), %{{.*}} // NATIVE_HALF: ret <2 x half> %hlsl.rcp -// DXIL_NO_HALF: define noundef nofpclass(nan inf) <2 x float> @ -// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) <2 x float> @ +// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) <2 x float> @ +// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) <2 x float> 
@ // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), %{{.*}} // NO_HALF: ret <2 x float> %hlsl.rcp half2 test_rcp_half2(half2 p0) { return rcp(p0); } -// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) <3 x half> @ -// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) <3 x half> @ +// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) <3 x half> @ +// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @ // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x half> splat (half 0xH3C00), %{{.*}} // NATIVE_HALF: ret <3 x half> %hlsl.rcp -// DXIL_NO_HALF: define noundef nofpclass(nan inf) <3 x float> @ -// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) <3 x float> @ +// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) <3 x float> @ +// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @ // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), %{{.*}} // NO_HALF: ret <3 x float> %hlsl.rcp half3 test_rcp_half3(half3 p0) { return rcp(p0); } -// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) <4 x half> @ -// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) <4 x half> @ +// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) <4 x half> @ +// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @ // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x half> splat (half 0xH3C00), %{{.*}} // NATIVE_HALF: ret <4 x half> %hlsl.rcp -// DXIL_NO_HALF: define noundef nofpclass(nan inf) <4 x float> @ -// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) <4 x float> @ +// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) <4 x float> @ +// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @ // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x float> splat (float 
1.000000e+00), %{{.*}} // NO_HALF: ret <4 x float> %hlsl.rcp half4 test_rcp_half4(half4 p0) { return rcp(p0); } -// DXIL_CHECK: define noundef nofpclass(nan inf) float @ -// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) float @ +// DXIL_CHECK: define hidden noundef nofpclass(nan inf) float @ +// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) float @ // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn float 1.000000e+00, %{{.*}} // CHECK: ret float %hlsl.rcp float test_rcp_float(float p0) { return rcp(p0); } -// DXIL_CHECK: define noundef nofpclass(nan inf) <2 x float> @ -// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <2 x float> @ +// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <2 x float> @ +// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @ // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), %{{.*}} // CHECK: ret <2 x float> %hlsl.rcp float2 test_rcp_float2(float2 p0) { return rcp(p0); } -// DXIL_CHECK: define noundef nofpclass(nan inf) <3 x float> @ -// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <3 x float> @ +// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @ +// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @ // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), %{{.*}} // CHECK: ret <3 x float> %hlsl.rcp float3 test_rcp_float3(float3 p0) { return rcp(p0); } -// DXIL_CHECK: define noundef nofpclass(nan inf) <4 x float> @ -// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <4 x float> @ +// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @ +// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @ // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), %{{.*}} // CHECK: ret <4 x float> %hlsl.rcp float4 test_rcp_float4(float4 p0) { return 
rcp(p0); } -// DXIL_CHECK: define noundef nofpclass(nan inf) double @ -// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) double @ +// DXIL_CHECK: define hidden noundef nofpclass(nan inf) double @ +// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) double @ // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn double 1.000000e+00, %{{.*}} // CHECK: ret double %hlsl.rcp double test_rcp_double(double p0) { return rcp(p0); } -// DXIL_CHECK: define noundef nofpclass(nan inf) <2 x double> @ -// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <2 x double> @ +// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <2 x double> @ +// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <2 x double> @ // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x double> splat (double 1.000000e+00), %{{.*}} // CHECK: ret <2 x double> %hlsl.rcp double2 test_rcp_double2(double2 p0) { return rcp(p0); } -// DXIL_CHECK: define noundef nofpclass(nan inf) <3 x double> @ -// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <3 x double> @ +// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <3 x double> @ +// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <3 x double> @ // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x double> splat (double 1.000000e+00), %{{.*}} // CHECK: ret <3 x double> %hlsl.rcp double3 test_rcp_double3(double3 p0) { return rcp(p0); } -// DXIL_CHECK: define noundef nofpclass(nan inf) <4 x double> @ -// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <4 x double> @ +// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <4 x double> @ +// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <4 x double> @ // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x double> splat (double 1.000000e+00), %{{.*}} // CHECK: ret <4 x double> %hlsl.rcp double4 test_rcp_double4(double4 p0) { return rcp(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl 
b/clang/test/CodeGenHLSL/builtins/reflect.hlsl index c082e63ac1da6..65fefd801ffed 100644 --- a/clang/test/CodeGenHLSL/builtins/reflect.hlsl +++ b/clang/test/CodeGenHLSL/builtins/reflect.hlsl @@ -6,7 +6,7 @@ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh( // CHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[I]], 0xH4000 @@ -15,7 +15,7 @@ // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[I]], [[MUL2_I]] // CHECK-NEXT: ret half [[SUB_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh( // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[I]], 0xH4000 @@ -28,7 +28,7 @@ half test_reflect_half(half I, half N) { return reflect(I, N); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[I]], <2 x half> nofpclass(nan inf) 
[[N]]) @@ -39,7 +39,7 @@ half test_reflect_half(half I, half N) { // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[I]], [[MUL1_I]] // CHECK-NEXT: ret <2 x half> [[SUB_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.reflect.v2f16(<2 x half> nofpclass(nan inf) [[I]], <2 x half> nofpclass(nan inf) [[N]]) @@ -49,7 +49,7 @@ half2 test_reflect_half2(half2 I, half2 N) { return reflect(I, N); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[I]], <3 x half> nofpclass(nan inf) [[N]]) @@ -60,7 +60,7 @@ half2 test_reflect_half2(half2 I, half2 N) { // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[I]], [[MUL1_I]] // CHECK-NEXT: ret <3 x half> [[SUB_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) 
local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.reflect.v3f16(<3 x half> nofpclass(nan inf) [[I]], <3 x half> nofpclass(nan inf) [[N]]) @@ -70,7 +70,7 @@ half3 test_reflect_half3(half3 I, half3 N) { return reflect(I, N); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[I]], <4 x half> nofpclass(nan inf) [[N]]) @@ -81,7 +81,7 @@ half3 test_reflect_half3(half3 I, half3 N) { // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[I]], [[MUL1_I]] // CHECK-NEXT: ret <4 x half> [[SUB_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.reflect.v4f16(<4 x half> nofpclass(nan inf) [[I]], <4 x half> nofpclass(nan inf) [[N]]) @@ -91,7 +91,7 @@ half4 test_reflect_half4(half4 I, half4 N) { return reflect(I, N); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_reflect_floatff( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_reflect_floatff( // CHECK-SAME: float noundef 
nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[I]], 2.000000e+00 @@ -100,7 +100,7 @@ half4 test_reflect_half4(half4 I, half4 N) { // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[I]], [[MUL2_I]] // CHECK-NEXT: ret float [[SUB_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_reflect_floatff( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_reflect_floatff( // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[I]], 2.000000e+00 @@ -113,7 +113,7 @@ float test_reflect_float(float I, float N) { return reflect(I, N); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[I]], <2 x float> nofpclass(nan inf) [[N]]) @@ -124,7 +124,7 @@ float test_reflect_float(float I, float N) { // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[I]], [[MUL1_I]] // CHECK-NEXT: ret <2 x float> [[SUB_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( // SPVCHECK-SAME: <2 x float> 
noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.reflect.v2f32(<2 x float> nofpclass(nan inf) [[I]], <2 x float> nofpclass(nan inf) [[N]]) @@ -134,7 +134,7 @@ float2 test_reflect_float2(float2 I, float2 N) { return reflect(I, N); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[I]], <3 x float> nofpclass(nan inf) [[N]]) @@ -145,7 +145,7 @@ float2 test_reflect_float2(float2 I, float2 N) { // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[I]], [[MUL1_I]] // CHECK-NEXT: ret <3 x float> [[SUB_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.reflect.v3f32(<3 x float> nofpclass(nan inf) [[I]], <3 x float> nofpclass(nan inf) [[N]]) @@ -155,7 +155,7 @@ float3 test_reflect_float3(float3 I, float3 N) { return reflect(I, N); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> 
@_Z19test_reflect_float4Dv4_fS_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[I]], <4 x float> nofpclass(nan inf) [[N]]) @@ -166,7 +166,7 @@ float3 test_reflect_float3(float3 I, float3 N) { // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[I]], [[MUL1_I]] // CHECK-NEXT: ret <4 x float> [[SUB_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.reflect.v4f32(<4 x float> nofpclass(nan inf) [[I]], <4 x float> nofpclass(nan inf) [[N]]) diff --git a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl index fe137b9cae4e9..91375c8f4eb8f 100644 --- a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl +++ b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl @@ -3,25 +3,25 @@ // RUN: -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s #ifdef __HLSL_ENABLE_16_BIT -// CHECK: define noundef i16 @ +// CHECK: define hidden noundef i16 @ // CHECK: call i16 @llvm.bitreverse.i16( uint16_t test_bitreverse_ushort(uint16_t p0) { return reversebits(p0); } -// CHECK: define noundef <2 x i16> @ +// CHECK: define hidden noundef <2 x i16> @ // CHECK: call <2 x i16> 
@llvm.bitreverse.v2i16 uint16_t2 test_bitreverse_ushort2(uint16_t2 p0) { return reversebits(p0); } -// CHECK: define noundef <3 x i16> @ +// CHECK: define hidden noundef <3 x i16> @ // CHECK: call <3 x i16> @llvm.bitreverse.v3i16 uint16_t3 test_bitreverse_ushort3(uint16_t3 p0) { return reversebits(p0); } -// CHECK: define noundef <4 x i16> @ +// CHECK: define hidden noundef <4 x i16> @ // CHECK: call <4 x i16> @llvm.bitreverse.v4i16 uint16_t4 test_bitreverse_ushort4(uint16_t4 p0) { @@ -29,50 +29,50 @@ uint16_t4 test_bitreverse_ushort4(uint16_t4 p0) } #endif -// CHECK: define noundef i32 @ +// CHECK: define hidden noundef i32 @ // CHECK: call i32 @llvm.bitreverse.i32( int test_bitreverse_uint(uint p0) { return reversebits(p0); } -// CHECK: define noundef <2 x i32> @ +// CHECK: define hidden noundef <2 x i32> @ // CHECK: call <2 x i32> @llvm.bitreverse.v2i32 uint2 test_bitreverse_uint2(uint2 p0) { return reversebits(p0); } -// CHECK: define noundef <3 x i32> @ +// CHECK: define hidden noundef <3 x i32> @ // CHECK: call <3 x i32> @llvm.bitreverse.v3i32 uint3 test_bitreverse_uint3(uint3 p0) { return reversebits(p0); } -// CHECK: define noundef <4 x i32> @ +// CHECK: define hidden noundef <4 x i32> @ // CHECK: call <4 x i32> @llvm.bitreverse.v4i32 uint4 test_bitreverse_uint4(uint4 p0) { return reversebits(p0); } -// CHECK: define noundef i64 @ +// CHECK: define hidden noundef i64 @ // CHECK: call i64 @llvm.bitreverse.i64( uint64_t test_bitreverse_long(uint64_t p0) { return reversebits(p0); } -// CHECK: define noundef <2 x i64> @ +// CHECK: define hidden noundef <2 x i64> @ // CHECK: call <2 x i64> @llvm.bitreverse.v2i64 uint64_t2 test_bitreverse_long2(uint64_t2 p0) { return reversebits(p0); } -// CHECK: define noundef <3 x i64> @ +// CHECK: define hidden noundef <3 x i64> @ // CHECK: call <3 x i64> @llvm.bitreverse.v3i64 uint64_t3 test_bitreverse_long3(uint64_t3 p0) { return reversebits(p0); } -// CHECK: define noundef <4 x i64> @ +// CHECK: define hidden noundef <4 x 
i64> @ // CHECK: call <4 x i64> @llvm.bitreverse.v4i64 uint64_t4 test_bitreverse_long4(uint64_t4 p0) { diff --git a/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl index 109633a64d34e..3b07fcec064d8 100644 --- a/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl @@ -2,87 +2,87 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_double // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( // CHECK: ret float %elt.roundeven float test_round_double(double p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_double2 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 // CHECK: ret <2 x float> %elt.roundeven float2 test_round_double2(double2 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_double3 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 // CHECK: ret <3 x float> %elt.roundeven float3 test_round_double3(double3 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_double4 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 // CHECK: ret <4 x float> %elt.roundeven float4 test_round_double4(double4 p0) { return round(p0); } -// 
CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_int // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( // CHECK: ret float %elt.roundeven float test_round_int(int p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int2 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 // CHECK: ret <2 x float> %elt.roundeven float2 test_round_int2(int2 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int3 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 // CHECK: ret <3 x float> %elt.roundeven float3 test_round_int3(int3 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int4 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 // CHECK: ret <4 x float> %elt.roundeven float4 test_round_int4(int4 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_uint // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( // CHECK: ret float %elt.roundeven float test_round_uint(uint p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint2 // CHECK: %elt.roundeven = call 
reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 // CHECK: ret <2 x float> %elt.roundeven float2 test_round_uint2(uint2 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint3 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 // CHECK: ret <3 x float> %elt.roundeven float3 test_round_uint3(uint3 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint4 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 // CHECK: ret <4 x float> %elt.roundeven float4 test_round_uint4(uint4 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_int64_t // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( // CHECK: ret float %elt.roundeven float test_round_int64_t(int64_t p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int64_t2 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 // CHECK: ret <2 x float> %elt.roundeven float2 test_round_int64_t2(int64_t2 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int64_t3 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 // CHECK: ret <3 x float> %elt.roundeven float3 
test_round_int64_t3(int64_t3 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int64_t4 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 // CHECK: ret <4 x float> %elt.roundeven float4 test_round_int64_t4(int64_t4 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_uint64_t // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( // CHECK: ret float %elt.roundeven float test_round_uint64_t(uint64_t p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint64_t2 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 // CHECK: ret <2 x float> %elt.roundeven float2 test_round_uint64_t2(uint64_t2 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint64_t3 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 // CHECK: ret <3 x float> %elt.roundeven float3 test_round_uint64_t3(uint64_t3 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint64_t4 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 // CHECK: ret <4 x float> %elt.roundeven float4 test_round_uint64_t4(uint64_t4 p0) { return round(p0); } diff --git 
a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl index a945a9677abbb..755f2e86fb116 100644 --- a/clang/test/CodeGenHLSL/builtins/round.hlsl +++ b/clang/test/CodeGenHLSL/builtins/round.hlsl @@ -5,48 +5,48 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_round_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_round_half // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn half @llvm.roundeven.f16( // NATIVE_HALF: ret half %elt.roundeven -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_round_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_round_half // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( // NO_HALF: ret float %elt.roundeven half test_round_half(half p0) { return round(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_round_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_round_half2 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.roundeven.v2f16 // NATIVE_HALF: ret <2 x half> %elt.roundeven -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_round_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_round_half2 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32( // NO_HALF: ret <2 x float> %elt.roundeven half2 test_round_half2(half2 p0) { return round(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_round_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_round_half3 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x half> 
@llvm.roundeven.v3f16 // NATIVE_HALF: ret <3 x half> %elt.roundeven -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_round_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_round_half3 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32( // NO_HALF: ret <3 x float> %elt.roundeven half3 test_round_half3(half3 p0) { return round(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_round_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_round_half4 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.roundeven.v4f16 // NATIVE_HALF: ret <4 x half> %elt.roundeven -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_round_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_round_half4 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32( // NO_HALF: ret <4 x float> %elt.roundeven half4 test_round_half4(half4 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_round_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_round_float // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32( // CHECK: ret float %elt.roundeven float test_round_float(float p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_round_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_round_float2 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32 // CHECK: ret <2 x float> %elt.roundeven float2 test_round_float2(float2 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_round_float3 +// CHECK-LABEL: define hidden 
noundef nofpclass(nan inf) <3 x float> @_Z17test_round_float3 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32 // CHECK: ret <3 x float> %elt.roundeven float3 test_round_float3(float3 p0) { return round(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_round_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_round_float4 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32 // CHECK: ret <4 x float> %elt.roundeven float4 test_round_float4(float4 p0) { return round(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl index 09f21f366b9d2..262f306b92572 100644 --- a/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl @@ -1,11 +1,11 @@ // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // CHECK: define [[FNATTRS]] float @ // CHECK: %hlsl.rsqrt = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].rsqrt.f32( diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl index 6c9b1f643713b..9c398fd6f06cb 100644 --- a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl @@ -2,20 
+2,20 @@ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // NATIVE_HALF: define [[FNATTRS]] half @ // NATIVE_HALF: %hlsl.rsqrt = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].rsqrt.f16( diff --git a/clang/test/CodeGenHLSL/builtins/sign.hlsl b/clang/test/CodeGenHLSL/builtins/sign.hlsl index 8cc910933f462..cbdb929388934 100644 --- a/clang/test/CodeGenHLSL/builtins/sign.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sign.hlsl @@ -2,20 +2,20 @@ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DTARGET=dx -DFNATTRS=noundef +// RUN: -DTARGET=dx 
-DFNATTRS="hidden noundef" // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DTARGET=dx -DFNATTRS=noundef +// RUN: -DTARGET=dx -DFNATTRS="hidden noundef" // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef" +// RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef" // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DTARGET=spv -DFNATTRS="spir_func noundef" +// RUN: -DTARGET=spv -DFNATTRS="hidden spir_func noundef" // NATIVE_HALF: define [[FNATTRS]] i32 @ // NATIVE_HALF: %hlsl.sign = call i32 @llvm.[[TARGET]].sign.f16( diff --git a/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl index a5522e4f28b7f..e471cb3d42c5c 100644 --- a/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl @@ -2,67 +2,67 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_double // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32( float test_sin_double(double p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32 float2 
test_sin_double2(double2 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32 float3 test_sin_double3(double3 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_double4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32 float4 test_sin_double4(double4 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_int // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32( float test_sin_int(int p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32 float2 test_sin_int2(int2 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32 float3 test_sin_int3(int3 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32 float4 test_sin_int4(int4 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_uint // 
CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32( float test_sin_uint(uint p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32 float2 test_sin_uint2(uint2 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32 float3 test_sin_uint3(uint3 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32 float4 test_sin_uint4(uint4 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_int64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32( float test_sin_int64_t(int64_t p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32 float2 test_sin_int64_t2(int64_t2 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32 float3 test_sin_int64_t3(int64_t3 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> 
{{.*}}test_sin_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32 float4 test_sin_int64_t4(int64_t4 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_uint64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32( float test_sin_uint64_t(uint64_t p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32 float2 test_sin_uint64_t2(uint64_t2 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32 float3 test_sin_uint64_t3(uint64_t3 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32 float4 test_sin_uint64_t4(uint64_t4 p0) { return sin(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl index 69c657239ef95..9bbe97997aa33 100644 --- a/clang/test/CodeGenHLSL/builtins/sin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl @@ -5,36 +5,36 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_sin_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half 
@_Z13test_sin_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.sin.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_sin_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_sin_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32( half test_sin_half(half p0) { return sin(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_sin_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_sin_half2 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.sin.v2f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_sin_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_sin_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32( half2 test_sin_half2(half2 p0) { return sin(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_sin_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_sin_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.sin.v3f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_sin_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_sin_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32( half3 test_sin_half3(half3 p0) { return sin(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_sin_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_sin_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.sin.v4f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_sin_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_sin_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x 
float> @llvm.sin.v4f32( half4 test_sin_half4(half4 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_sin_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_sin_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32( float test_sin_float(float p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_sin_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_sin_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32 float2 test_sin_float2(float2 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_sin_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_sin_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32 float3 test_sin_float3(float3 p0) { return sin(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_sin_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_sin_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32 float4 test_sin_float4(float4 p0) { return sin(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl index d3e5c1059029c..bef64ce77d470 100644 --- a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl +++ b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl @@ -6,7 +6,7 @@ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh( // CHECK-SAME: half noundef nofpclass(nan inf) [[MIN:%.*]], half noundef nofpclass(nan inf) [[MAX:%.*]], half noundef nofpclass(nan inf) 
[[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[MIN]] @@ -19,7 +19,7 @@ // CHECK-NEXT: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[TMP0]], [[SUB2_I]] // CHECK-NEXT: ret half [[MUL4_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh( // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[MIN:%.*]], half noundef nofpclass(nan inf) [[MAX:%.*]], half noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.smoothstep.f16(half nofpclass(nan inf) [[MIN]], half nofpclass(nan inf) [[MAX]], half nofpclass(nan inf) [[X]]) @@ -27,7 +27,7 @@ // half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, Max, X); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_( // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[MIN]] @@ -40,7 +40,7 @@ half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, M // CHECK-NEXT: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> [[TMP0]], [[SUB2_I]] // CHECK-NEXT: ret <2 x half> [[MUL4_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_( +// SPVCHECK-LABEL: define hidden 
spir_func noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.smoothstep.v2f16(<2 x half> nofpclass(nan inf) [[MIN]], <2 x half> nofpclass(nan inf) [[MAX]], <2 x half> nofpclass(nan inf) [[X]]) @@ -48,7 +48,7 @@ half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, M // half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(Min, Max, X); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_( // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[MIN]] @@ -61,7 +61,7 @@ half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(M // CHECK-NEXT: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> [[TMP0]], [[SUB2_I]] // CHECK-NEXT: ret <3 x half> [[MUL4_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] 
// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.smoothstep.v3f16(<3 x half> nofpclass(nan inf) [[MIN]], <3 x half> nofpclass(nan inf) [[MAX]], <3 x half> nofpclass(nan inf) [[X]]) @@ -69,7 +69,7 @@ half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(M // half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(Min, Max, X); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_( // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[MIN]] @@ -82,7 +82,7 @@ half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(M // CHECK-NEXT: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> [[TMP0]], [[SUB2_I]] // CHECK-NEXT: ret <4 x half> [[MUL4_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.smoothstep.v4f16(<4 x half> nofpclass(nan inf) [[MIN]], <4 x half> nofpclass(nan inf) [[MAX]], <4 x half> nofpclass(nan inf) [[X]]) @@ -90,7 +90,7 @@ half3 test_smoothstep_half3(half3 Min, half3 Max, 
half3 X) { return smoothstep(M // half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(Min, Max, X); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff( // CHECK-SAME: float noundef nofpclass(nan inf) [[MIN:%.*]], float noundef nofpclass(nan inf) [[MAX:%.*]], float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[MIN]] @@ -103,7 +103,7 @@ half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(M // CHECK-NEXT: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[TMP0]], [[SUB2_I]] // CHECK-NEXT: ret float [[MUL4_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff( // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[MIN:%.*]], float noundef nofpclass(nan inf) [[MAX:%.*]], float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.smoothstep.f32(float nofpclass(nan inf) [[MIN]], float nofpclass(nan inf) [[MAX]], float nofpclass(nan inf) [[X]]) @@ -111,7 +111,7 @@ half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(M // float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(Min, Max, X); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_( // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x float> noundef 
nofpclass(nan inf) [[MAX:%.*]], <2 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[MIN]] @@ -124,7 +124,7 @@ float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(M // CHECK-NEXT: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> [[TMP0]], [[SUB2_I]] // CHECK-NEXT: ret <2 x float> [[MUL4_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_( // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.smoothstep.v2f32(<2 x float> nofpclass(nan inf) [[MIN]], <2 x float> nofpclass(nan inf) [[MAX]], <2 x float> nofpclass(nan inf) [[X]]) @@ -132,7 +132,7 @@ float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(M // float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smoothstep(Min, Max, X); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_( // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[MIN]] @@ -145,7 +145,7 @@ float2 test_smoothstep_float2(float2 
Min, float2 Max, float2 X) { return smooths // CHECK-NEXT: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> [[TMP0]], [[SUB2_I]] // CHECK-NEXT: ret <3 x float> [[MUL4_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.smoothstep.v3f32(<3 x float> nofpclass(nan inf) [[MIN]], <3 x float> nofpclass(nan inf) [[MAX]], <3 x float> nofpclass(nan inf) [[X]]) @@ -153,7 +153,7 @@ float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smooths // float3 test_smoothstep_float3(float3 Min, float3 Max, float3 X) { return smoothstep(Min, Max, X); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_( +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_( // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[MIN]] @@ -166,7 +166,7 @@ float3 test_smoothstep_float3(float3 Min, float3 Max, float3 X) { return smooths // CHECK-NEXT: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> [[TMP0]], [[SUB2_I]] // CHECK-NEXT: ret <4 x float> [[MUL4_I]] // -// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> 
@_Z22test_smoothstep_float4Dv4_fS_S_( +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.smoothstep.v4f32(<4 x float> nofpclass(nan inf) [[MIN]], <4 x float> nofpclass(nan inf) [[MAX]], <4 x float> nofpclass(nan inf) [[X]]) diff --git a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl index a883c9d5cc355..aeb2b79e90291 100644 --- a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl +++ b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl @@ -8,7 +8,7 @@ // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 0 // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 1 // -// SPIRV: define spir_func {{.*}} i32 {{.*}}test_scalar{{.*}}(double {{.*}} [[VALD:%.*]]) +// SPIRV: define hidden spir_func {{.*}} i32 {{.*}}test_scalar{{.*}}(double {{.*}} [[VALD:%.*]]) // SPIRV-NOT: @llvm.dx.splitdouble.i32 // SPIRV: [[LOAD:%.*]] = load double, ptr [[VALD]].addr, align 8 // SPIRV-NEXT: [[CAST:%.*]] = bitcast double [[LOAD]] to <2 x i32> @@ -26,7 +26,7 @@ uint test_scalar(double D) { // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 0 // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 1 // -// SPIRV: define spir_func {{.*}} <1 x i32> {{.*}}test_double1{{.*}}(<1 x double> {{.*}} [[VALD:%.*]]) +// SPIRV: define hidden spir_func {{.*}} <1 x i32> {{.*}}test_double1{{.*}}(<1 x double> {{.*}} [[VALD:%.*]]) // SPIRV-NOT: @llvm.dx.splitdouble.i32 // SPIRV: [[LOAD:%.*]] = load <1 x double>, ptr [[VALD]].addr, align 8 // SPIRV-NEXT: [[TRUNC:%.*]] = extractelement <1 x double> [[LOAD]], i64 0 @@ -44,7 +44,7 @@ uint1 
test_double1(double1 D) { // CHECK-NEXT: extractvalue { <2 x i32>, <2 x i32> } [[VALRET]], 0 // CHECK-NEXT: extractvalue { <2 x i32>, <2 x i32> } [[VALRET]], 1 // -// SPIRV: define spir_func {{.*}} <2 x i32> {{.*}}test_vector2{{.*}}(<2 x double> {{.*}} [[VALD:%.*]]) +// SPIRV: define hidden spir_func {{.*}} <2 x i32> {{.*}}test_vector2{{.*}}(<2 x double> {{.*}} [[VALD:%.*]]) // SPIRV-NOT: @llvm.dx.splitdouble.i32 // SPIRV: [[LOAD:%.*]] = load <2 x double>, ptr [[VALD]].addr, align 16 // SPIRV-NEXT: [[CAST1:%.*]] = bitcast <2 x double> [[LOAD]] to <4 x i32> @@ -61,7 +61,7 @@ uint2 test_vector2(double2 D) { // CHECK-NEXT: extractvalue { <3 x i32>, <3 x i32> } [[VALRET]], 0 // CHECK-NEXT: extractvalue { <3 x i32>, <3 x i32> } [[VALRET]], 1 // -// SPIRV: define spir_func {{.*}} <3 x i32> {{.*}}test_vector3{{.*}}(<3 x double> {{.*}} [[VALD:%.*]]) +// SPIRV: define hidden spir_func {{.*}} <3 x i32> {{.*}}test_vector3{{.*}}(<3 x double> {{.*}} [[VALD:%.*]]) // SPIRV-NOT: @llvm.dx.splitdouble.i32 // SPIRV: [[LOAD:%.*]] = load <3 x double>, ptr [[VALD]].addr, align 32 // SPIRV-NEXT: [[CAST1:%.*]] = bitcast <3 x double> [[LOAD]] to <6 x i32> @@ -78,7 +78,7 @@ uint3 test_vector3(double3 D) { // CHECK-NEXT: extractvalue { <4 x i32>, <4 x i32> } [[VALRET]], 0 // CHECK-NEXT: extractvalue { <4 x i32>, <4 x i32> } [[VALRET]], 1 // -// SPIRV: define spir_func {{.*}} <4 x i32> {{.*}}test_vector4{{.*}}(<4 x double> {{.*}} [[VALD:%.*]]) +// SPIRV: define hidden spir_func {{.*}} <4 x i32> {{.*}}test_vector4{{.*}}(<4 x double> {{.*}} [[VALD:%.*]]) // SPIRV-NOT: @llvm.dx.splitdouble.i32 // SPIRV: [[LOAD:%.*]] = load <4 x double>, ptr [[VALD]].addr, align 32 // SPIRV-NEXT: [[CAST1:%.*]] = bitcast <4 x double> [[LOAD]] to <8 x i32> diff --git a/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl index 48b74c9db5c64..d4de244f38b3e 100644 --- a/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl +++ 
b/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl @@ -2,87 +2,87 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_double // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32( // CHECK: ret float %{{.*}} float test_sqrt_double(double p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_double2 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32 // CHECK: ret <2 x float> %{{.*}} float2 test_sqrt_double2(double2 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_double3 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32 // CHECK: ret <3 x float> %{{.*}} float3 test_sqrt_double3(double3 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_double4 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32 // CHECK: ret <4 x float> %{{.*}} float4 test_sqrt_double4(double4 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_int // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32( // CHECK: ret float %{{.*}} float test_sqrt_int(int p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int2 +// CHECK-LABEL: define hidden 
noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int2 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32 // CHECK: ret <2 x float> %{{.*}} float2 test_sqrt_int2(int2 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int3 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32 // CHECK: ret <3 x float> %{{.*}} float3 test_sqrt_int3(int3 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int4 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32 // CHECK: ret <4 x float> %{{.*}} float4 test_sqrt_int4(int4 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32( // CHECK: ret float %{{.*}} float test_sqrt_uint(uint p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint2 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32 // CHECK: ret <2 x float> %{{.*}} float2 test_sqrt_uint2(uint2 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint3 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32 // CHECK: ret <3 x float> %{{.*}} float3 test_sqrt_uint3(uint3 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x 
float> {{.*}}test_sqrt_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint4 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32 // CHECK: ret <4 x float> %{{.*}} float4 test_sqrt_uint4(uint4 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_int64_t // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32( // CHECK: ret float %{{.*}} float test_sqrt_int64_t(int64_t p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int64_t2 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32 // CHECK: ret <2 x float> %{{.*}} float2 test_sqrt_int64_t2(int64_t2 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int64_t3 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32 // CHECK: ret <3 x float> %{{.*}} float3 test_sqrt_int64_t3(int64_t3 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int64_t4 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32 // CHECK: ret <4 x float> %{{.*}} float4 test_sqrt_int64_t4(int64_t4 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint64_t // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32( // CHECK: ret float %{{.*}} float 
test_sqrt_uint64_t(uint64_t p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint64_t2 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32 // CHECK: ret <2 x float> %{{.*}} float2 test_sqrt_uint64_t2(uint64_t2 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint64_t3 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32 // CHECK: ret <3 x float> %{{.*}} float3 test_sqrt_uint64_t3(uint64_t3 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint64_t4 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32 // CHECK: ret <4 x float> %{{.*}} float4 test_sqrt_uint64_t4(uint64_t4 p0) { return sqrt(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl index 94d966f0bef8a..31839f6bc177d 100644 --- a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl @@ -5,48 +5,48 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_sqrt_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_sqrt_half // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn half @llvm.sqrt.f16( // NATIVE_HALF: ret half %{{.*}} -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_sqrt_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_sqrt_half // NO_HALF: %{{.*}} = call reassoc nnan 
ninf nsz arcp afn float @llvm.sqrt.f32( // NO_HALF: ret float %{{.*}} half test_sqrt_half(half p0) { return sqrt(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_sqrt_half2 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_sqrt_half2 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.sqrt.v2f16 // NATIVE_HALF: ret <2 x half> %{{.*}} -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_sqrt_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_sqrt_half2 // NO_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32( // NO_HALF: ret <2 x float> %{{.*}} half2 test_sqrt_half2(half2 p0) { return sqrt(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_sqrt_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_sqrt_half3 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.sqrt.v3f16 // NATIVE_HALF: ret <3 x half> %{{.*}} -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_sqrt_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_sqrt_half3 // NO_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32( // NO_HALF: ret <3 x float> %{{.*}} half3 test_sqrt_half3(half3 p0) { return sqrt(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_sqrt_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_sqrt_half4 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.sqrt.v4f16 // NATIVE_HALF: ret <4 x half> %{{.*}} -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_sqrt_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_sqrt_half4 // NO_HALF: %{{.*}} = call reassoc nnan ninf 
nsz arcp afn <4 x float> @llvm.sqrt.v4f32( // NO_HALF: ret <4 x float> %{{.*}} half4 test_sqrt_half4(half4 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_sqrt_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_sqrt_float // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32( // CHECK: ret float %{{.*}} float test_sqrt_float(float p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_sqrt_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_sqrt_float2 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32 // CHECK: ret <2 x float> %{{.*}} float2 test_sqrt_float2(float2 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_sqrt_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_sqrt_float3 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32 // CHECK: ret <3 x float> %{{.*}} float3 test_sqrt_float3(float3 p0) { return sqrt(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_sqrt_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_sqrt_float4 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32 // CHECK: ret <4 x float> %{{.*}} float4 test_sqrt_float4(float4 p0) { return sqrt(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl index d3b979254391c..f55a8f8aff92d 100644 --- a/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl @@ -1,11 +1,11 @@ // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s 
--check-prefixes=CHECK \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // CHECK: define [[FNATTRS]] float @ // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].step.f32(float diff --git a/clang/test/CodeGenHLSL/builtins/step.hlsl b/clang/test/CodeGenHLSL/builtins/step.hlsl index 49d09e5c6fe6f..be0ffbd794646 100644 --- a/clang/test/CodeGenHLSL/builtins/step.hlsl +++ b/clang/test/CodeGenHLSL/builtins/step.hlsl @@ -2,20 +2,20 @@ // RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx +// RUN: -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm 
-disable-llvm-passes \ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv +// RUN: -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv // NATIVE_HALF: define [[FNATTRS]] half @ // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].step.f16(half diff --git a/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl index d913aabfb4066..51eb20c58e405 100644 --- a/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl +++ b/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl @@ -2,82 +2,82 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_double +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_double // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32( float test_trunc_double(double p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_double2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_double2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32 float2 test_trunc_double2(double2 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_double3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_double3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32 float3 test_trunc_double3(double3 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_double4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_double4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32 float4 test_trunc_double4(double4 p0) { return 
trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_int +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_int // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32( float test_trunc_int(int p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32 float2 test_trunc_int2(int2 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32 float3 test_trunc_int3(int3 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32 float4 test_trunc_int4(int4 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_uint +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_uint // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32( float test_trunc_uint(uint p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32 float2 test_trunc_uint2(uint2 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint3 // 
CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32 float3 test_trunc_uint3(uint3 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32 float4 test_trunc_uint4(uint4 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_int64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_int64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32( float test_trunc_int64_t(int64_t p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32 float2 test_trunc_int64_t2(int64_t2 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32 float3 test_trunc_int64_t3(int64_t3 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32 float4 test_trunc_int64_t4(int64_t4 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_uint64_t +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_uint64_t // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32( float test_trunc_uint64_t(uint64_t p0) { return 
trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint64_t2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint64_t2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32 float2 test_trunc_uint64_t2(uint64_t2 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint64_t3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint64_t3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32 float3 test_trunc_uint64_t3(uint64_t3 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint64_t4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint64_t4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32 float4 test_trunc_uint64_t4(uint64_t4 p0) { return trunc(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl index 26de5bf94c3cc..c1c6ee4119f0d 100644 --- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl +++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl @@ -5,42 +5,42 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | \ // RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_trunc_half +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_trunc_half // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.trunc.f16( -// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_trunc_half +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_trunc_half // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32( half test_trunc_half(half p0) { return trunc(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_trunc_half2 +// 
NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_trunc_half2 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.trunc.v2f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_trunc_half2 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_trunc_half2 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32( half2 test_trunc_half2(half2 p0) { return trunc(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_trunc_half3 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_trunc_half3 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.trunc.v3f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_trunc_half3 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_trunc_half3 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32( half3 test_trunc_half3(half3 p0) { return trunc(p0); } -// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_trunc_half4 +// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_trunc_half4 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.trunc.v4f16 -// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_trunc_half4 +// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_trunc_half4 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32( half4 test_trunc_half4(half4 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_trunc_float +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_trunc_float // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32( float test_trunc_float(float p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x 
float> @_Z17test_trunc_float2 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_trunc_float2 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32 float2 test_trunc_float2(float2 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_trunc_float3 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_trunc_float3 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32 float3 test_trunc_float3(float3 p0) { return trunc(p0); } -// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_trunc_float4 +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_trunc_float4 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32 float4 test_trunc_float4(float4 p0) { return trunc(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl index 3ab8048146ad3..0df3598a3cc3e 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ // RUN: spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s -// CHECK: define spir_func void @{{.*main.*}}() [[A0:#[0-9]+]] { +// CHECK: define hidden spir_func void @{{.*main.*}}() [[A0:#[0-9]+]] { void main() { // CHECK: entry: // CHECK: %[[CT_ENTRY:[0-9]+]] = call token @llvm.experimental.convergence.entry() diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl index 8e1f2d69e7432..9034cae254036 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl @@ -6,8 +6,8 @@ // RUN: 
-emit-llvm -disable-llvm-passes -o - | FileCheck %s \ // RUN: --check-prefixes=CHECK,CHECK-DXIL -// CHECK-SPIRV: define spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { -// CHECK-DXIL: define noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { +// CHECK-SPIRV: define hidden spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { +// CHECK-DXIL: define hidden noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { // CHECK-SPIRV: %[[CI:[0-9]+]] = call token @llvm.experimental.convergence.entry() // CHECK-SPIRV: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ] // CHECK-DXIL: call i32 @llvm.dx.wave.getlaneindex() diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl index 12b120d0c067d..a71b988417f09 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ // RUN: spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s -// CHECK: define spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] { +// CHECK: define hidden spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] { // CHECK: %[[C1:[0-9]+]] = call token @llvm.experimental.convergence.entry() // CHECK: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[C1]]) ] uint test_1() { @@ -10,7 +10,7 @@ uint test_1() { // CHECK-DAG: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] -// CHECK: define spir_func noundef i32 @_Z6test_2v() [[A0]] { +// CHECK: define hidden spir_func noundef i32 @_Z6test_2v() [[A0]] { // CHECK: %[[C2:[0-9]+]] = call token @llvm.experimental.convergence.entry() // CHECK: call spir_func noundef i32 @_Z6test_1v() {{#[0-9]+}} [ "convergencectrl"(token %[[C2]]) ] uint test_2() { diff --git a/clang/test/CodeGenHLSL/cbuffer.hlsl 
b/clang/test/CodeGenHLSL/cbuffer.hlsl index eebf0f682d3de..b58a49b41eb98 100644 --- a/clang/test/CodeGenHLSL/cbuffer.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer.hlsl @@ -46,14 +46,14 @@ cbuffer CBScalars : register(b1, space5) { // CHECK: @CBScalars.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBScalars, // CHECK-SAME: 56, 0, 8, 16, 24, 32, 36, 40, 48)) -// CHECK: @a1 = external addrspace(2) global float, align 4 -// CHECK: @a2 = external addrspace(2) global double, align 8 -// CHECK: @a3 = external addrspace(2) global half, align 2 -// CHECK: @a4 = external addrspace(2) global i64, align 8 -// CHECK: @a5 = external addrspace(2) global i32, align 4 -// CHECK: @a6 = external addrspace(2) global i16, align 2 -// CHECK: @a7 = external addrspace(2) global i32, align 4 -// CHECK: @a8 = external addrspace(2) global i64, align 8 +// CHECK: @a1 = external hidden addrspace(2) global float, align 4 +// CHECK: @a2 = external hidden addrspace(2) global double, align 8 +// CHECK: @a3 = external hidden addrspace(2) global half, align 2 +// CHECK: @a4 = external hidden addrspace(2) global i64, align 8 +// CHECK: @a5 = external hidden addrspace(2) global i32, align 4 +// CHECK: @a6 = external hidden addrspace(2) global i16, align 2 +// CHECK: @a7 = external hidden addrspace(2) global i32, align 4 +// CHECK: @a8 = external hidden addrspace(2) global i64, align 8 // CHECK: @CBScalars.str = private unnamed_addr constant [10 x i8] c"CBScalars\00", align 1 cbuffer CBVectors { @@ -69,13 +69,13 @@ cbuffer CBVectors { // CHECK: @CBVectors.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBVectors, // CHECK-SAME: 136, 0, 16, 40, 48, 80, 96, 112)) -// CHECK: @b1 = external addrspace(2) global <3 x float>, align 16 -// CHECK: @b2 = external addrspace(2) global <3 x double>, align 32 -// CHECK: @b3 = external addrspace(2) global <2 x half>, align 4 -// CHECK: @b4 = external addrspace(2) global <3 x i64>, align 32 -// CHECK: @b5 = external addrspace(2) global 
<4 x i32>, align 16 -// CHECK: @b6 = external addrspace(2) global <3 x i16>, align 8 -// CHECK: @b7 = external addrspace(2) global <3 x i64>, align 32 +// CHECK: @b1 = external hidden addrspace(2) global <3 x float>, align 16 +// CHECK: @b2 = external hidden addrspace(2) global <3 x double>, align 32 +// CHECK: @b3 = external hidden addrspace(2) global <2 x half>, align 4 +// CHECK: @b4 = external hidden addrspace(2) global <3 x i64>, align 32 +// CHECK: @b5 = external hidden addrspace(2) global <4 x i32>, align 16 +// CHECK: @b6 = external hidden addrspace(2) global <3 x i16>, align 8 +// CHECK: @b7 = external hidden addrspace(2) global <3 x i64>, align 32 // CHECK: @CBVectors.str = private unnamed_addr constant [10 x i8] c"CBVectors\00", align 1 cbuffer CBArrays : register(b2) { @@ -91,14 +91,14 @@ cbuffer CBArrays : register(b2) { // CHECK: @CBArrays.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBArrays, // CHECK-SAME: 708, 0, 48, 112, 176, 224, 608, 624, 656)) -// CHECK: @c1 = external addrspace(2) global [3 x float], align 4 -// CHECK: @c2 = external addrspace(2) global [2 x <3 x double>], align 32 -// CHECK: @c3 = external addrspace(2) global [2 x [2 x half]], align 2 -// CHECK: @c4 = external addrspace(2) global [3 x i64], align 8 -// CHECK: @c5 = external addrspace(2) global [2 x [3 x [4 x <4 x i32>]]], align 16 -// CHECK: @c6 = external addrspace(2) global [1 x i16], align 2 -// CHECK: @c7 = external addrspace(2) global [2 x i64], align 8 -// CHECK: @c8 = external addrspace(2) global [4 x i32], align 4 +// CHECK: @c1 = external hidden addrspace(2) global [3 x float], align 4 +// CHECK: @c2 = external hidden addrspace(2) global [2 x <3 x double>], align 32 +// CHECK: @c3 = external hidden addrspace(2) global [2 x [2 x half]], align 2 +// CHECK: @c4 = external hidden addrspace(2) global [3 x i64], align 8 +// CHECK: @c5 = external hidden addrspace(2) global [2 x [3 x [4 x <4 x i32>]]], align 16 +// CHECK: @c6 = external hidden 
addrspace(2) global [1 x i16], align 2 +// CHECK: @c7 = external hidden addrspace(2) global [2 x i64], align 8 +// CHECK: @c8 = external hidden addrspace(2) global [4 x i32], align 4 // CHECK: @CBArrays.str = private unnamed_addr constant [9 x i8] c"CBArrays\00", align 1 typedef uint32_t4 uint32_t8[2]; @@ -112,8 +112,8 @@ cbuffer CBTypedefArray : register(space2) { // CHECK: @CBTypedefArray.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBTypedefArray, // CHECK-SAME: 128, 0, 64)) -// CHECK: @t1 = external addrspace(2) global [2 x [2 x <4 x i32>]], align 16 -// CHECK: @t2 = external addrspace(2) global [2 x [2 x <4 x i32>]], align 16 +// CHECK: @t1 = external hidden addrspace(2) global [2 x [2 x <4 x i32>]], align 16 +// CHECK: @t2 = external hidden addrspace(2) global [2 x [2 x <4 x i32>]], align 16 // CHECK: @CBTypedefArray.str = private unnamed_addr constant [15 x i8] c"CBTypedefArray\00", align 1 struct Empty {}; @@ -137,13 +137,13 @@ struct D { // CHECK: @CBStructs.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBStructs, // CHECK-SAME: 246, 0, 16, 32, 64, 144, 238, 240)) -// CHECK: @a = external addrspace(2) global target("dx.Layout", %A, 8, 0), align 1 -// CHECK: @b = external addrspace(2) global target("dx.Layout", %B, 14, 0, 8), align 1 -// CHECK: @c = external addrspace(2) global target("dx.Layout", %C, 24, 0, 16), align 1 -// CHECK: @array_of_A = external addrspace(2) global [5 x target("dx.Layout", %A, 8, 0)], align 1 -// CHECK: @d = external addrspace(2) global target("dx.Layout", %__cblayout_D, 94, 0), align 1 -// CHECK: @e = external addrspace(2) global half, align 2 -// CHECK: @f = external addrspace(2) global <3 x i16>, align 8 +// CHECK: @a = external hidden addrspace(2) global target("dx.Layout", %A, 8, 0), align 1 +// CHECK: @b = external hidden addrspace(2) global target("dx.Layout", %B, 14, 0, 8), align 1 +// CHECK: @c = external hidden addrspace(2) global target("dx.Layout", %C, 24, 0, 16), align 1 +// 
CHECK: @array_of_A = external hidden addrspace(2) global [5 x target("dx.Layout", %A, 8, 0)], align 1 +// CHECK: @d = external hidden addrspace(2) global target("dx.Layout", %__cblayout_D, 94, 0), align 1 +// CHECK: @e = external hidden addrspace(2) global half, align 2 +// CHECK: @f = external hidden addrspace(2) global <3 x i16>, align 8 // CHECK: @CBStructs.str = private unnamed_addr constant [10 x i8] c"CBStructs\00", align 1 cbuffer CBStructs { @@ -178,10 +178,10 @@ cbuffer CBClasses { // CHECK: @CBClasses.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBClasses, // CHECK-SAME: 260, 0, 16, 32, 112)) -// CHECK: @k = external addrspace(2) global target("dx.Layout", %K, 4, 0), align 1 -// CHECK: @l = external addrspace(2) global target("dx.Layout", %L, 8, 0, 4), align 1 -// CHECK: @m = external addrspace(2) global target("dx.Layout", %M, 68, 0), align 1 -// CHECK: @ka = external addrspace(2) global [10 x target("dx.Layout", %K, 4, 0)], align 1 +// CHECK: @k = external hidden addrspace(2) global target("dx.Layout", %K, 4, 0), align 1 +// CHECK: @l = external hidden addrspace(2) global target("dx.Layout", %L, 8, 0, 4), align 1 +// CHECK: @m = external hidden addrspace(2) global target("dx.Layout", %M, 68, 0), align 1 +// CHECK: @ka = external hidden addrspace(2) global [10 x target("dx.Layout", %K, 4, 0)], align 1 // CHECK: @CBClasses.str = private unnamed_addr constant [10 x i8] c"CBClasses\00", align 1 struct Test { @@ -190,16 +190,16 @@ struct Test { // CHECK: @CBMix.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBMix, // CHECK-SAME: 170, 0, 24, 32, 120, 128, 136, 144, 152, 160, 168)) -// CHECK: @test = external addrspace(2) global [2 x target("dx.Layout", %Test, 8, 0, 4)], align 1 -// CHECK: @f1 = external addrspace(2) global float, align 4 -// CHECK: @f2 = external addrspace(2) global [3 x [2 x <2 x float>]], align 8 -// CHECK: @f3 = external addrspace(2) global float, align 4 -// CHECK: @f4 = external addrspace(2) global 
target("dx.Layout", %anon, 4, 0), align 1 -// CHECK: @f5 = external addrspace(2) global double, align 8 -// CHECK: @f6 = external addrspace(2) global target("dx.Layout", %anon.0, 8, 0), align 1 -// CHECK: @f7 = external addrspace(2) global float, align 4 -// CHECK: @f8 = external addrspace(2) global <1 x double>, align 8 -// CHECK: @f9 = external addrspace(2) global i16, align 2 +// CHECK: @test = external hidden addrspace(2) global [2 x target("dx.Layout", %Test, 8, 0, 4)], align 1 +// CHECK: @f1 = external hidden addrspace(2) global float, align 4 +// CHECK: @f2 = external hidden addrspace(2) global [3 x [2 x <2 x float>]], align 8 +// CHECK: @f3 = external hidden addrspace(2) global float, align 4 +// CHECK: @f4 = external hidden addrspace(2) global target("dx.Layout", %anon, 4, 0), align 1 +// CHECK: @f5 = external hidden addrspace(2) global double, align 8 +// CHECK: @f6 = external hidden addrspace(2) global target("dx.Layout", %anon.0, 8, 0), align 1 +// CHECK: @f7 = external hidden addrspace(2) global float, align 4 +// CHECK: @f8 = external hidden addrspace(2) global <1 x double>, align 8 +// CHECK: @f9 = external hidden addrspace(2) global i16, align 2 // CHECK: @CBMix.str = private unnamed_addr constant [6 x i8] c"CBMix\00", align 1 cbuffer CBMix { diff --git a/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl b/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl index 4a7e2597dc0ff..33f480bf445e5 100644 --- a/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl @@ -8,14 +8,14 @@ // CHECK: %"n0::Foo" = type <{ float }> // CHECK: @A.cb = global target("dx.CBuffer", target("dx.Layout", %"n0::n1::__cblayout_A", 4, 0)) -// CHECK: @_ZN2n02n11aE = external addrspace(2) global float, align 4 +// CHECK: @_ZN2n02n11aE = external hidden addrspace(2) global float, align 4 // CHECK: @B.cb = global target("dx.CBuffer", target("dx.Layout", %"n0::__cblayout_B", 4, 0)) -// CHECK: @_ZN2n01aE = external addrspace(2) 
global float, align 4 +// CHECK: @_ZN2n01aE = external hidden addrspace(2) global float, align 4 // CHECK: @C.cb = global target("dx.CBuffer", target("dx.Layout", %"n0::n2::__cblayout_C", 20, 0, 16)) -// CHECK: @_ZN2n02n21aE = external addrspace(2) global float, align 4 -// CHECK: external addrspace(2) global target("dx.Layout", %"n0::Foo", 4, 0), align 1 +// CHECK: @_ZN2n02n21aE = external hidden addrspace(2) global float, align 4 +// CHECK: external hidden addrspace(2) global target("dx.Layout", %"n0::Foo", 4, 0), align 1 namespace n0 { struct Foo { diff --git a/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl b/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl index 0d092f0c36c29..16d22a5b1fdd4 100644 --- a/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl @@ -6,9 +6,9 @@ // CHECK: %__cblayout_CB_1 = type <{ float, <2 x float> }> // CHECK: @CB.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 176, 16, 168, 88)) -// CHECK: @a = external addrspace(2) global float, align 4 -// CHECK: @b = external addrspace(2) global double, align 8 -// CHECK: @c = external addrspace(2) global <2 x i32>, align 8 +// CHECK: @a = external hidden addrspace(2) global float, align 4 +// CHECK: @b = external hidden addrspace(2) global double, align 8 +// CHECK: @c = external hidden addrspace(2) global <2 x i32>, align 8 // CHECK: @CB.str = private unnamed_addr constant [3 x i8] c"CB\00", align 1 cbuffer CB : register(b1, space3) { @@ -18,8 +18,8 @@ cbuffer CB : register(b1, space3) { } // CHECK: @CB.cb.1 = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB_1, 92, 88, 80)) -// CHECK: @x = external addrspace(2) global float, align 4 -// CHECK: @y = external addrspace(2) global <2 x float>, align 8 +// CHECK: @x = external hidden addrspace(2) global float, align 4 +// CHECK: @y = external hidden addrspace(2) global <2 x float>, align 8 // Missing packoffset annotation will produce a warning. 
// Element x will be placed after the element y that has an explicit packoffset. diff --git a/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl b/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl index a6034386ac450..cda231d8d2ebb 100644 --- a/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl @@ -3,7 +3,7 @@ // CHECK: %__cblayout_A = type <{ float }> // CHECK: @A.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_A, 4, 0)) -// CHECK: @a = external addrspace(2) global float, align 4 +// CHECK: @a = external hidden addrspace(2) global float, align 4 // CHECK-DAG: @_ZL1b = internal global float 3.000000e+00, align 4 // CHECK-NOT: @B.cb diff --git a/clang/test/CodeGenHLSL/convergence/do.while.hlsl b/clang/test/CodeGenHLSL/convergence/do.while.hlsl index 934fe3ea9eb7a..9aabbfd54e539 100644 --- a/clang/test/CodeGenHLSL/convergence/do.while.hlsl +++ b/clang/test/CodeGenHLSL/convergence/do.while.hlsl @@ -8,7 +8,7 @@ void test1() { do { } while (cond()); } -// CHECK-LABEL: define spir_func void @_Z5test1v() +// CHECK-LABEL: define hidden spir_func void @_Z5test1v() // CHECK-SAME: [[A0:#[0-9]+]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -22,7 +22,7 @@ void test2() { foo(); } while (cond()); } -// CHECK-LABEL: define spir_func void @_Z5test2v() +// CHECK-LABEL: define hidden spir_func void @_Z5test2v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -38,7 +38,7 @@ void test3() { foo(); } while (cond()); } -// CHECK-LABEL: define spir_func void @_Z5test3v() +// CHECK-LABEL: define hidden spir_func void @_Z5test3v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -57,7 +57,7 @@ void test4() { } } while (cond()); } -// CHECK-LABEL: define 
spir_func void @_Z5test4v() +// CHECK-LABEL: define hidden spir_func void @_Z5test4v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -78,7 +78,7 @@ void test5() { } } while (cond()); } -// CHECK-LABEL: define spir_func void @_Z5test5v() +// CHECK-LABEL: define hidden spir_func void @_Z5test5v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() diff --git a/clang/test/CodeGenHLSL/convergence/for.hlsl b/clang/test/CodeGenHLSL/convergence/for.hlsl index 363c6a48839b5..b7b11e9959ea8 100644 --- a/clang/test/CodeGenHLSL/convergence/for.hlsl +++ b/clang/test/CodeGenHLSL/convergence/for.hlsl @@ -10,7 +10,7 @@ void test1() { foo(); } } -// CHECK-LABEL: define spir_func void @_Z5test1v() +// CHECK-LABEL: define hidden spir_func void @_Z5test1v() // CHECK-SAME: [[A0:#[0-9]+]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -23,7 +23,7 @@ void test2() { foo(); } } -// CHECK-LABEL: define spir_func void @_Z5test2v() +// CHECK-LABEL: define hidden spir_func void @_Z5test2v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -38,7 +38,7 @@ void test3() { foo(); } } -// CHECK-LABEL: define spir_func void @_Z5test3v() +// CHECK-LABEL: define hidden spir_func void @_Z5test3v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -52,7 +52,7 @@ void test4() { foo(); } } -// CHECK-LABEL: define spir_func void @_Z5test4v() +// CHECK-LABEL: define hidden spir_func void @_Z5test4v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -67,7 +67,7 @@ void test5() { for (cond();cond2();foo()) { } } -// CHECK-LABEL: define spir_func void @_Z5test5v() +// CHECK-LABEL: define hidden spir_func void 
@_Z5test5v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -86,7 +86,7 @@ void test6() { } } } -// CHECK-LABEL: define spir_func void @_Z5test6v() +// CHECK-LABEL: define hidden spir_func void @_Z5test6v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -112,7 +112,7 @@ void test7() { } } } -// CHECK-LABEL: define spir_func void @_Z5test7v() +// CHECK-LABEL: define hidden spir_func void @_Z5test7v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() diff --git a/clang/test/CodeGenHLSL/convergence/while.hlsl b/clang/test/CodeGenHLSL/convergence/while.hlsl index 570b4b1336717..32579e8631001 100644 --- a/clang/test/CodeGenHLSL/convergence/while.hlsl +++ b/clang/test/CodeGenHLSL/convergence/while.hlsl @@ -8,7 +8,7 @@ void test1() { while (cond()) { } } -// CHECK-LABEL: define spir_func void @_Z5test1v() +// CHECK-LABEL: define hidden spir_func void @_Z5test1v() // CHECK-SAME: [[A0:#[0-9]+]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -21,7 +21,7 @@ void test2() { foo(); } } -// CHECK-LABEL: define spir_func void @_Z5test2v() +// CHECK-LABEL: define hidden spir_func void @_Z5test2v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -38,7 +38,7 @@ void test3() { foo(); } } -// CHECK-LABEL: define spir_func void @_Z5test3v() +// CHECK-LABEL: define hidden spir_func void @_Z5test3v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -59,7 +59,7 @@ void test4() { } } } -// CHECK-LABEL: define spir_func void @_Z5test4v() +// CHECK-LABEL: define hidden spir_func void @_Z5test4v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token 
@llvm.experimental.convergence.entry() @@ -82,7 +82,7 @@ void test5() { } } } -// CHECK-LABEL: define spir_func void @_Z5test5v() +// CHECK-LABEL: define hidden spir_func void @_Z5test5v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() @@ -107,7 +107,7 @@ void test6() { } } } -// CHECK-LABEL: define spir_func void @_Z5test6v() +// CHECK-LABEL: define hidden spir_func void @_Z5test6v() // CHECK-SAME: [[A0]] { // CHECK: entry: // CHECK: [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry() diff --git a/clang/test/CodeGenHLSL/default_cbuffer.hlsl b/clang/test/CodeGenHLSL/default_cbuffer.hlsl index 557913042e884..ad4d92f8afc02 100644 --- a/clang/test/CodeGenHLSL/default_cbuffer.hlsl +++ b/clang/test/CodeGenHLSL/default_cbuffer.hlsl @@ -6,14 +6,14 @@ // CHECK: %__cblayout_S = type <{ float }> // DXIL-DAG: @"$Globals.cb" = global target("dx.CBuffer", target("dx.Layout", %"__cblayout_$Globals", 20, 0, 4, 16)) -// DXIL-DAG: @a = external addrspace(2) global float -// DXIL-DAG: @g = external addrspace(2) global float -// DXIL-DAG: @h = external addrspace(2) global target("dx.Layout", %__cblayout_S, 4, 0), align 4 +// DXIL-DAG: @a = external hidden addrspace(2) global float +// DXIL-DAG: @g = external hidden addrspace(2) global float +// DXIL-DAG: @h = external hidden addrspace(2) global target("dx.Layout", %__cblayout_S, 4, 0), align 4 // SPIRV-DAG: @"$Globals.cb" = global target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 20, 0, 4, 16), 2, 0) -// SPIRV-DAG: @a = external addrspace(12) global float -// SPIRV-DAG: @g = external addrspace(12) global float -// SPIRV-DAG: @h = external addrspace(12) global target("spirv.Layout", %__cblayout_S, 4, 0), align 8 +// SPIRV-DAG: @a = external hidden addrspace(12) global float +// SPIRV-DAG: @g = external hidden addrspace(12) global float +// SPIRV-DAG: @h = external hidden addrspace(12) global target("spirv.Layout", 
%__cblayout_S, 4, 0), align 8 struct EmptyStruct { }; diff --git a/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl b/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl index 40e3196649a50..1b2cb0e99aa83 100644 --- a/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl +++ b/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl @@ -4,14 +4,14 @@ // CHECK-SAME: target("dx.Layout", %S, 8, 0) }> // CHECK: %S = type <{ <2 x float> }> -// CHECK-DAG: @b = external addrspace(2) global float, align 4 -// CHECK-DAG: @d = external addrspace(2) global <4 x i32>, align 16 +// CHECK-DAG: @b = external hidden addrspace(2) global float, align 4 +// CHECK-DAG: @d = external hidden addrspace(2) global <4 x i32>, align 16 // CHECK-DAG: @"$Globals.cb" = global target("dx.CBuffer", // CHECK-DAG-SAME: target("dx.Layout", %"__cblayout_$Globals", 144, 120, 16, 32, 64, 128, 112)) -// CHECK-DAG: @a = external addrspace(2) global i32, align 4 -// CHECK-DAG: @c = external addrspace(2) global [4 x double], align 8 -// CHECK-DAG: @e = external addrspace(2) global <4 x float>, align 16 -// CHECK-DAG: @s = external addrspace(2) global target("dx.Layout", %S, 8, 0), align 1 +// CHECK-DAG: @a = external hidden addrspace(2) global i32, align 4 +// CHECK-DAG: @c = external hidden addrspace(2) global [4 x double], align 8 +// CHECK-DAG: @e = external hidden addrspace(2) global <4 x float>, align 16 +// CHECK-DAG: @s = external hidden addrspace(2) global target("dx.Layout", %S, 8, 0), align 1 struct S { float2 v; diff --git a/clang/test/CodeGenHLSL/export.hlsl b/clang/test/CodeGenHLSL/export.hlsl index 770618ff2e070..e72dbde5188a9 100644 --- a/clang/test/CodeGenHLSL/export.hlsl +++ b/clang/test/CodeGenHLSL/export.hlsl @@ -5,17 +5,15 @@ export void f1() { } -// CHECK: define void @_ZN11MyNamespace2f2Ev() [[Attr]] +// CHECK: define void @_ZN11MyNamespace2f2Ev() namespace MyNamespace { export void f2() { } } export { -// CHECK: define void @_Z2f3v() [[Attr]] -// CHECK: define void 
@_Z2f4v() [[Attr]] +// CHECK: define void @_Z2f3v() +// CHECK: define void @_Z2f4v() void f3() {} void f4() {} -} - -// CHECK: attributes [[Attr]] = { {{.*}} "hlsl.export" {{.*}} } +} \ No newline at end of file diff --git a/clang/test/CodeGenHLSL/group_shared.hlsl b/clang/test/CodeGenHLSL/group_shared.hlsl index a562e75b34881..6498c53752d4e 100644 --- a/clang/test/CodeGenHLSL/group_shared.hlsl +++ b/clang/test/CodeGenHLSL/group_shared.hlsl @@ -8,7 +8,7 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // Make sure groupshared translated into address space 3. -// CHECK:@a = addrspace(3) global [10 x float] +// CHECK:@a = hidden addrspace(3) global [10 x float] groupshared float a[10]; diff --git a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl index 12d3eeedb590f..60238cbf8eff5 100644 --- a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl +++ b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl @@ -12,7 +12,7 @@ struct Node { }; // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define noundef i32 @_Z4FindA100_4Nodej(ptr noundef byval([100 x %struct.Node]) align 1 %SortedTree, i32 noundef %key) [[IntAttr:\#[0-9]+]] +// CHECK: define hidden noundef i32 @_Z4FindA100_4Nodej(ptr noundef byval([100 x %struct.Node]) align 1 %SortedTree, i32 noundef %key) [[Attr:\#[0-9]+]] // CHECK: ret i32 // Find and return value corresponding to key in the SortedTree uint Find(Node SortedTree[MAX], uint key) { @@ -31,7 +31,7 @@ uint Find(Node SortedTree[MAX], uint key) { } // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define noundef i1 @_Z8InitTreeA100_4NodeN4hlsl8RWBufferIDv4_jEEj(ptr noundef byval([100 x %struct.Node]) align 1 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]] +// CHECK: define noundef i1 @_Z8InitTreeA100_4NodeN4hlsl8RWBufferIDv4_jEEj(ptr noundef byval([100 x %struct.Node]) align 1 %tree, ptr noundef 
byval(%"class.hlsl::RWBuffer") align 4 %encodedTree, i32 noundef %maxDepth) [[Attr:\#[0-9]+]] // CHECK: ret i1 // Initialize tree with given buffer // Imagine the inout works @@ -52,7 +52,7 @@ RWBuffer gTree; // Mangled entry points are internal // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define internal void @_Z4mainj(i32 noundef %GI) [[IntAttr]] +// CHECK: define internal void @_Z4mainj(i32 noundef %GI) [[Attr]] // CHECK: ret void // Canonical entry points are external and shader attributed @@ -71,7 +71,7 @@ void main(uint GI : SV_GroupIndex) { // Mangled entry points are internal // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define internal void @_Z11defaultMainv() [[IntAttr]] +// CHECK: define internal void @_Z11defaultMainv() [[Attr]] // CHECK: ret void // Canonical entry points are external and shader attributed @@ -88,6 +88,5 @@ void defaultMain() { needle = Find(haystack, needle); } -// CHECK: attributes [[IntAttr]] = {{.*}} norecurse -// CHECK: attributes [[ExtAttr]] = {{.*}} norecurse +// CHECK: attributes [[Attr]] = {{.*}} norecurse // CHECK: attributes [[EntryAttr]] = {{.*}} norecurse diff --git a/clang/test/CodeGenHLSL/inline-functions.hlsl b/clang/test/CodeGenHLSL/inline-functions.hlsl index 4748eeee7475f..0c7467e2f972e 100644 --- a/clang/test/CodeGenHLSL/inline-functions.hlsl +++ b/clang/test/CodeGenHLSL/inline-functions.hlsl @@ -15,7 +15,7 @@ float nums[MAX]; // Verify that all functions have the alwaysinline attribute // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define void @_Z4swapA100_jjj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %ix1, i32 noundef %ix2) [[IntAttr:\#[0-9]+]] +// NOINLINE: define hidden void @_Z4swapA100_jjj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %ix1, i32 noundef %ix2) [[Attr:\#[0-9]+]] // NOINLINE: ret void // Swap the values of Buf at indices ix1 and ix2 void swap(unsigned Buf[MAX], unsigned ix1, unsigned ix2) { @@ -25,7 +25,7 @@ void swap(unsigned Buf[MAX], unsigned ix1, 
unsigned ix2) { } // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define void @_Z10BubbleSortA100_jj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) [[IntAttr]] +// NOINLINE: define hidden void @_Z10BubbleSortA100_jj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) [[Attr]] // NOINLINE: ret void // Inefficiently sort Buf in place void BubbleSort(unsigned Buf[MAX], unsigned size) { @@ -43,7 +43,7 @@ void BubbleSort(unsigned Buf[MAX], unsigned size) { // Note ExtAttr is the inlined export set of attribs // CHECK: Function Attrs: alwaysinline -// CHECK: define noundef i32 @_Z11RemoveDupesA100_jj(ptr {{[a-z_ ]*}}noundef byval([100 x i32]) align 4 {{.*}}%Buf, i32 noundef %size) {{[a-z_ ]*}}[[ExtAttr:\#[0-9]+]] +// CHECK: define noundef i32 @_Z11RemoveDupesA100_jj(ptr {{[a-z_ ]*}}noundef byval([100 x i32]) align 4 {{.*}}%Buf, i32 noundef %size) {{[a-z_ ]*}}[[Attr:\#[0-9]+]] // CHECK: ret i32 // Sort Buf and remove any duplicate values // returns the number of values left @@ -65,9 +65,9 @@ RWBuffer Indices; // The mangled version of main only remains without inlining // because it has internal linkage from the start -// Note main functions get the norecurse attrib, which IntAttr reflects +// Note main functions get the alwaysinline attrib, which Attr reflects // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define internal void @_Z4mainj(i32 noundef %GI) [[IntAttr]] +// NOINLINE: define internal void @_Z4mainj(i32 noundef %GI) [[Attr]] // NOINLINE: ret void // The unmangled version is not inlined, EntryAttr reflects that @@ -93,9 +93,9 @@ void main(unsigned int GI : SV_GroupIndex) { // The mangled version of main only remains without inlining // because it has internal linkage from the start -// Note main functions get the norecurse attrib, which IntAttr reflects +// Note main functions get the alwaysinline attrib, which Attr reflects // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define internal void @_Z6main10v() 
[[IntAttr]] +// NOINLINE: define internal void @_Z6main10v() [[Attr]] // NOINLINE: ret void // The unmangled version is not inlined, EntryAttr reflects that @@ -113,6 +113,5 @@ void main10() { main(10); } -// NOINLINE: attributes [[IntAttr]] = {{.*}} alwaysinline -// CHECK: attributes [[ExtAttr]] = {{.*}} alwaysinline +// CHECK: attributes [[Attr]] = {{.*}} alwaysinline // CHECK: attributes [[EntryAttr]] = {{.*}} noinline diff --git a/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl b/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl index 5b58e436bbedb..7149be0122f4d 100644 --- a/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl +++ b/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl @@ -18,12 +18,12 @@ struct S { Int i; }; -// CHECK: define spir_func target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) @_Z14getArrayBufferu17spirv_type_28_0_0U5_TypeN4hlsl8RWBufferIfEEU6_ConstLm4E(target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) %v) #0 +// CHECK: define hidden spir_func target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) @_Z14getArrayBufferu17spirv_type_28_0_0U5_TypeN4hlsl8RWBufferIfEEU6_ConstLm4E(target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) %v) #0 ArrayBuffer<4> getArrayBuffer(ArrayBuffer<4> v) { return v; } -// CHECK: define spir_func target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) @_Z6getIntu18spirv_type_21_4_32U4_LitLi32EU4_LitLi0E(target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) %v) #0 +// CHECK: define hidden spir_func target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) @_Z6getIntu18spirv_type_21_4_32U4_LitLi32EU4_LitLi0E(target("spirv.Type", 
target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) %v) #0 Int getInt(Int v) { return v; } diff --git a/clang/test/CodeGenHLSL/no_int_promotion.hlsl b/clang/test/CodeGenHLSL/no_int_promotion.hlsl index 78bff3b13810d..b4ffcb477f1ba 100644 --- a/clang/test/CodeGenHLSL/no_int_promotion.hlsl +++ b/clang/test/CodeGenHLSL/no_int_promotion.hlsl @@ -10,37 +10,37 @@ int16_t add(int16_t a, int16_t b) { return a + b; } -// CHECK: define noundef <2 x i16> @ +// CHECK: define hidden noundef <2 x i16> @ // CHECK: add <2 x i16> int16_t2 add(int16_t2 a, int16_t2 b) { return a + b; } -// CHECK: define noundef <3 x i16> @ +// CHECK: define hidden noundef <3 x i16> @ // CHECK: add <3 x i16> int16_t3 add(int16_t3 a, int16_t3 b) { return a + b; } -// CHECK: define noundef <4 x i16> @ +// CHECK: define hidden noundef <4 x i16> @ // CHECK: add <4 x i16> int16_t4 add(int16_t4 a, int16_t4 b) { return a + b; } -// CHECK: define noundef i16 @ +// CHECK: define hidden noundef i16 @ // CHECK: add i16 % uint16_t add(uint16_t a, uint16_t b) { return a + b; } -// CHECK: define noundef <2 x i16> @ +// CHECK: define hidden noundef <2 x i16> @ // CHECK: add <2 x i16> uint16_t2 add(uint16_t2 a, uint16_t2 b) { return a + b; } -// CHECK: define noundef <3 x i16> @ +// CHECK: define hidden noundef <3 x i16> @ // CHECK: add <3 x i16> uint16_t3 add(uint16_t3 a, uint16_t3 b) { return a + b; } -// CHECK: define noundef <4 x i16> @ +// CHECK: define hidden noundef <4 x i16> @ // CHECK: add <4 x i16> uint16_t4 add(uint16_t4 a, uint16_t4 b) { return a + b; diff --git a/clang/test/CodeGenHLSL/out-of-line-static.hlsl b/clang/test/CodeGenHLSL/out-of-line-static.hlsl index 8127a6c2ec1e4..57f6c123e50e5 100644 --- a/clang/test/CodeGenHLSL/out-of-line-static.hlsl +++ b/clang/test/CodeGenHLSL/out-of-line-static.hlsl @@ -6,8 +6,8 @@ struct S { }; int S::Value = 1; -// DXIL: @_ZN1S5ValueE = global i32 1, align 4 -// SPIRV: @_ZN1S5ValueE = addrspace(10) global i32 1, align 4 +// DXIL: @_ZN1S5ValueE = 
hidden global i32 1, align 4 +// SPIRV: @_ZN1S5ValueE = hidden addrspace(10) global i32 1, align 4 [shader("compute")] [numthreads(1,1,1)] diff --git a/clang/test/CodeGenHLSL/shift-mask.hlsl b/clang/test/CodeGenHLSL/shift-mask.hlsl index 7b3890ae560d2..41e05330ed1a5 100644 --- a/clang/test/CodeGenHLSL/shift-mask.hlsl +++ b/clang/test/CodeGenHLSL/shift-mask.hlsl @@ -5,7 +5,7 @@ int shl32(int V, int S) { return V << S; } -// CHECK-LABEL: define noundef i32 @_Z5shl32ii(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-LABEL: define hidden noundef i32 @_Z5shl32ii(i32 noundef %V, i32 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 // CHECK-DAG: %{{.*}} = shl i32 %{{.*}}, %[[Masked]] @@ -13,7 +13,7 @@ int shr32(int V, int S) { return V >> S; } -// CHECK-LABEL: define noundef i32 @_Z5shr32ii(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-LABEL: define hidden noundef i32 @_Z5shr32ii(i32 noundef %V, i32 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 // CHECK-DAG: %{{.*}} = ashr i32 %{{.*}}, %[[Masked]] @@ -21,7 +21,7 @@ int64_t shl64(int64_t V, int64_t S) { return V << S; } -// CHECK-LABEL: define noundef i64 @_Z5shl64ll(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-LABEL: define hidden noundef i64 @_Z5shl64ll(i64 noundef %V, i64 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 // CHECK-DAG: %{{.*}} = shl i64 %{{.*}}, %[[Masked]] @@ -29,7 +29,7 @@ int64_t shr64(int64_t V, int64_t S) { return V >> S; } -// CHECK-LABEL: define noundef i64 @_Z5shr64ll(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-LABEL: define hidden noundef i64 @_Z5shr64ll(i64 noundef %V, i64 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 // CHECK-DAG: %{{.*}} = ashr i64 %{{.*}}, %[[Masked]] @@ -37,7 +37,7 @@ uint shlu32(uint V, uint S) { return V << S; } -// CHECK-LABEL: define noundef i32 @_Z6shlu32jj(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-LABEL: define hidden noundef i32 @_Z6shlu32jj(i32 noundef %V, i32 noundef %S) #0 { 
// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 // CHECK-DAG: %{{.*}} = shl i32 %{{.*}}, %[[Masked]] @@ -45,7 +45,7 @@ uint shru32(uint V, uint S) { return V >> S; } -// CHECK-LABEL: define noundef i32 @_Z6shru32jj(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-LABEL: define hidden noundef i32 @_Z6shru32jj(i32 noundef %V, i32 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 // CHECK-DAG: %{{.*}} = lshr i32 %{{.*}}, %[[Masked]] @@ -53,7 +53,7 @@ uint64_t shlu64(uint64_t V, uint64_t S) { return V << S; } -// CHECK-LABEL: define noundef i64 @_Z6shlu64mm(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-LABEL: define hidden noundef i64 @_Z6shlu64mm(i64 noundef %V, i64 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 // CHECK-DAG: %{{.*}} = shl i64 %{{.*}}, %[[Masked]] @@ -61,6 +61,6 @@ uint64_t shru64(uint64_t V, uint64_t S) { return V >> S; } -// CHECK-LABEL: define noundef i64 @_Z6shru64mm(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-LABEL: define hidden noundef i64 @_Z6shru64mm(i64 noundef %V, i64 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 // CHECK-DAG: %{{.*}} = lshr i64 %{{.*}}, %[[Masked]] diff --git a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl index a87eb0b38f603..a2df307038774 100644 --- a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl +++ b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl @@ -25,7 +25,7 @@ void main() { } // This test makes a probably safe assumption that HLSL 202x includes operator overloading for assignment operators. 
-// CHECK: define linkonce_odr noundef i32 @_ZN4Pair8getFirstEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 { +// CHECK: define linkonce_odr hidden noundef i32 @_ZN4Pair8getFirstEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%Another = alloca %struct.Pair, align 1 @@ -42,7 +42,7 @@ void main() { // CHECK-NEXT:%0 = load i32, ptr %First2, align 1 // CHECK-NEXT:ret i32 %0 -// CHECK: define linkonce_odr noundef i32 @_ZN4Pair9getSecondEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 { +// CHECK: define linkonce_odr hidden noundef i32 @_ZN4Pair9getSecondEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 1 diff --git a/clang/test/CodeGenHLSL/vk-input-builtin.hlsl b/clang/test/CodeGenHLSL/vk-input-builtin.hlsl index 1cc7963c0e289..157a1818c82ff 100644 --- a/clang/test/CodeGenHLSL/vk-input-builtin.hlsl +++ b/clang/test/CodeGenHLSL/vk-input-builtin.hlsl @@ -3,7 +3,7 @@ [[vk::ext_builtin_input(/* WorkgroupId */ 26)]] static const uint3 groupid; -// CHECK: @_ZL7groupid = external local_unnamed_addr addrspace(7) externally_initialized constant <3 x i32>, align 16, !spirv.Decorations [[META0:![0-9]+]] +// CHECK: @_ZL7groupid = external hidden local_unnamed_addr addrspace(7) externally_initialized constant <3 x i32>, align 16, !spirv.Decorations [[META0:![0-9]+]] RWStructuredBuffer output : register(u1, space0); diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep 
b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld new file mode 100755 index 0000000000000..b23e55619b2ff --- /dev/null +++ b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld @@ -0,0 +1 @@ +#!/bin/true diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld 
b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld new file mode 100755 index 0000000000000..b23e55619b2ff --- /dev/null +++ b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld @@ -0,0 +1 @@ +#!/bin/true diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld new file mode 100755 index 0000000000000..b23e55619b2ff --- /dev/null +++ b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld @@ -0,0 +1 @@ +#!/bin/true diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git 
a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld new file mode 100755 index 0000000000000..b23e55619b2ff --- /dev/null +++ b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld @@ -0,0 +1 @@ +#!/bin/true diff --git a/clang/test/Driver/aarch64-gnutools.c b/clang/test/Driver/aarch64-gnutools.c new file mode 100644 index 0000000000000..0214639ed3804 --- /dev/null +++ b/clang/test/Driver/aarch64-gnutools.c @@ -0,0 +1,4 @@ +// RUN: %clang --target=aarch64-none-elf --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -fno-integrated-as %s -### -c \ +// RUN: 2>&1 | FileCheck %s + +// CHECK: "{{.*}}as{{(.exe)?}}" diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c new file mode 100644 index 0000000000000..2610e962bd690 --- /dev/null +++ b/clang/test/Driver/aarch64-toolchain-extra.c @@ -0,0 +1,28 @@ +// A basic clang -cc1 command-line, and simple environment check. + +// The tests here are similar to those in aarch64-toolchain.c, however +// these tests need to create symlinks to test directory trees in order to +// set up the environment and therefore shell support is required. +// REQUIRES: shell +// UNSUPPORTED: system-windows + +// If there is no GCC install detected then the driver searches for executables +// and runtime starting from the directory tree above the driver itself. 
+// The test below checks that the driver correctly finds the linker and +// runtime if and only if they exist. +// +// RUN: rm -rf %t +// RUN: mkdir -p %t/aarch64-nogcc/bin +// RUN: ln -s %clang %t/aarch64-nogcc/bin/clang +// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf %t/aarch64-nogcc/aarch64-none-elf +// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \ +// RUN: --gcc-toolchain=%t/aarch64-nogcc/invalid \ +// RUN: --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \ +// RUN: | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s + +// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \ +// RUN: --sysroot=%t/aarch64-nogcc/bin/../aarch64-none-elf \ +// RUN: --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \ +// RUN: | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s + +// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include" diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c new file mode 100644 index 0000000000000..7f2c01d928e43 --- /dev/null +++ b/clang/test/Driver/aarch64-toolchain.c @@ -0,0 +1,61 @@ +// UNSUPPORTED: system-windows + +// RUN: %clang -### %s -fuse-ld= \ +// RUN: --target=aarch64-none-elf --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \ +// RUN: --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \ +// RUN: | FileCheck -check-prefix=C-AARCH64-BAREMETAL %s + +// C-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf" +// C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf" +// C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include" + +// RUN: %clang -### %s -fuse-ld= \ +// RUN: --target=aarch64-none-elf --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \ +// RUN: --sysroot= 2>&1 \ +// RUN: | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOSYSROOT %s + +// 
C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf" +// C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include" + +// RUN: %clangxx -### %s -fuse-ld= \ +// RUN: --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \ +// RUN: --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \ +// RUN: | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL %s + +// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf" +// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/backward" +// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1" +// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include" + +// RUN: %clangxx -### %s -fuse-ld= \ +// RUN: --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \ +// RUN: --sysroot= 2>&1 \ +// RUN: | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT %s + +// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf" +// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/backward" +// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1" +// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" 
"{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include" + +// RUN: %clangxx -### %s -fuse-ld= \ +// RUN: --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \ +// RUN: --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \ +// RUN: | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-LIBCXX %s + +// CXX-AARCH64-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf" +// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1" +// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include" + +// RUN: %clangxx -### %s -fuse-ld= \ +// RUN: --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \ +// RUN: --sysroot= 2>&1 \ +// RUN: | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX %s + +// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1" +// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include" diff --git a/clang/test/Driver/android-link.cpp b/clang/test/Driver/android-link.cpp index ab7dae5405587..b103263cdd3f0 100644 --- a/clang/test/Driver/android-link.cpp +++ b/clang/test/Driver/android-link.cpp @@ -16,6 +16,16 @@ // RUN: FileCheck -check-prefix=CORTEX-A57 < %t %s // RUN: %clang --target=aarch64-none-linux-android \ +// RUN: -mno-fix-cortex-a53-843419 \ +// RUN: -### -v %s 2> %t +// RUN: FileCheck -check-prefix=OVERRIDDEN < %t %s +// +// RUN: %clang -target aarch64-none-linux-android \ +// RUN: -mno-fix-cortex-a53-843419 -mfix-cortex-a53-843419 \ +// RUN: -### -v %s 2> %t +// RUN: FileCheck 
-check-prefix=OVERRIDDEN2 < %t %s +// +// RUN: %clang -target aarch64-none-linux-android \ // RUN: -### -v %s 2> %t // RUN: FileCheck -check-prefix=MAX-PAGE-SIZE-16KB < %t %s @@ -31,6 +41,8 @@ // GENERIC-ARM: --fix-cortex-a53-843419 // CORTEX-A53: --fix-cortex-a53-843419 // CORTEX-A57-NOT: --fix-cortex-a53-843419 +// OVERRIDDEN-NOT: --fix-cortex-a53-843419 +// OVERRIDDEN2: --fix-cortex-a53-843419 // MAX-PAGE-SIZE-4KB: "-z" "max-page-size=4096" // MAX-PAGE-SIZE-16KB: "-z" "max-page-size=16384" // NO-MAX-PAGE-SIZE-16KB-NOT: "-z" "max-page-size=16384" diff --git a/clang/test/Driver/arm-gnutools.c b/clang/test/Driver/arm-gnutools.c new file mode 100644 index 0000000000000..6e107f19dabc5 --- /dev/null +++ b/clang/test/Driver/arm-gnutools.c @@ -0,0 +1,6 @@ +// check that gnu assembler is invoked with arm baremetal as well + +// RUN: %clang --target=armv6m-none-eabi --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -fno-integrated-as %s -### -c \ +// RUN: 2>&1 | FileCheck %s + +// CHECK: "{{.*}}as{{(.exe)?}}" diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c new file mode 100644 index 0000000000000..114de0a8154ab --- /dev/null +++ b/clang/test/Driver/arm-toolchain-extra.c @@ -0,0 +1,29 @@ +// A basic clang -cc1 command-line, and simple environment check. + +// The tests here are similar to those in arm-toolchain.c, however +// these tests need to create symlinks to test directory trees in order to +// set up the environment and therefore shell support is required. +// REQUIRES: shell +// UNSUPPORTED: system-windows + +// If there is no GCC install detected then the driver searches for executables +// and runtime starting from the directory tree above the driver itself. +// The test below checks that the driver correctly finds the linker and +// runtime if and only if they exist. 
+// +// RUN: rm -rf %t +// RUN: mkdir -p %t/arm-nogcc/bin +// RUN: ln -s %clang %t/arm-nogcc/bin/clang +// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi %t/arm-nogcc/armv6m-none-eabi +// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \ +// RUN: --gcc-toolchain=%t/arm-nogcc/invalid \ +// RUN: --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \ +// RUN: | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s + +// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \ +// RUN: --sysroot=%t/arm-nogcc/bin/../armv6m-none-eabi \ +// RUN: --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \ +// RUN: | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s + +// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/include" + diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c new file mode 100644 index 0000000000000..2e38461fb7a3e --- /dev/null +++ b/clang/test/Driver/arm-toolchain.c @@ -0,0 +1,62 @@ +// UNSUPPORTED: system-windows + +// RUN: %clang -### %s -fuse-ld= \ +// RUN: --target=armv6m-none-eabi --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \ +// RUN: --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \ +// RUN: | FileCheck -check-prefix=C-ARM-BAREMETAL %s + +// C-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi" +// C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi" +// C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include" + +// RUN: %clang -### %s -fuse-ld= \ +// RUN: --target=armv6m-none-eabi --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \ +// RUN: --sysroot= 2>&1 \ +// RUN: | FileCheck -check-prefix=C-ARM-BAREMETAL-NOSYSROOT %s + +// C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi" +// C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" 
"{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include" + +// RUN: %clangxx -### %s -fuse-ld= \ +// RUN: --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \ +// RUN: --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \ +// RUN: | FileCheck -check-prefix=CXX-ARM-BAREMETAL %s + +// CXX-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi" +// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi" +// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/backward" +// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1" +// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include" + +// RUN: %clangxx -### %s -fuse-ld= \ +// RUN: --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \ +// RUN: --sysroot= 2>&1 \ +// RUN: | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT %s + +// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi" +// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/backward" +// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1" +// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include" + +// RUN: %clangxx -### %s -fuse-ld= \ +// RUN: --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc 
\ +// RUN: --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \ +// RUN: --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \ +// RUN: | FileCheck -check-prefix=CXX-ARM-BAREMETAL-LIBCXX %s + +// CXX-ARM-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi" +// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1" +// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include" + +// RUN: %clangxx -### %s -fuse-ld= \ +// RUN: --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \ +// RUN: --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \ +// RUN: --sysroot= 2>&1 \ +// RUN: | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX %s + +// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1" +// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp index a80aa9b437117..2ac83402dda30 100644 --- a/clang/test/Driver/baremetal.cpp +++ b/clang/test/Driver/baremetal.cpp @@ -196,6 +196,22 @@ // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include" // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" +// RUN: %clang -no-canonical-prefixes %s -### --target=riscv32-unknown-elf 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-RISCV32-NO-HOST-INC %s +// CHECK-RISCV32-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]] +// CHECK-RISCV32-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]" +// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" 
"[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1" +// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include" +// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" + +// RUN: %clang -no-canonical-prefixes %s -### --target=riscv64-unknown-elf 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-RISCV64-NO-HOST-INC %s +// CHECK-RISCV64-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]] +// CHECK-RISCV64-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]" +// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1" +// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include" +// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" + // RUN: %clang %s -### --target=riscv64-unknown-elf -o %t.out -L some/directory/user/asked/for \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-RV64 %s diff --git a/clang/test/Driver/check-no-multlib-warning.c b/clang/test/Driver/check-no-multlib-warning.c new file mode 100644 index 0000000000000..9a0d7cee450a3 --- /dev/null +++ b/clang/test/Driver/check-no-multlib-warning.c @@ -0,0 +1,10 @@ +// UNSUPPORTED: system-windows + + +// RUN: %clang --target=armv6m-none-eabi --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -### 2>&1 | FileCheck %s +// RUN: %clang --target=aarch64-none-elf --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -### 2>&1 | FileCheck %s +// RUN: %clang --target=riscv32-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv32_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s +// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv64_tree -### 
2>&1 | FileCheck --check-prefix=NOCHECK %s + +// CHECK: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets +// NOCHECK-NOT: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 0535285862b9f..eb079895a0a88 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -821,7 +821,11 @@ // ARM64EC_OVERRIDE: warning: /arm64EC has been overridden by specified target: x86_64-pc-windows-msvc; option ignored // RUN: %clang_cl /d2epilogunwind /c -### -- %s 2>&1 | FileCheck %s --check-prefix=EPILOGUNWIND -// EPILOGUNWIND: -fwinx64-eh-unwindv2 +// EPILOGUNWIND: -fwinx64-eh-unwindv2=best-effort + +// RUN: %clang_cl /d2epilogunwindrequirev2 /c -### -- %s 2>&1 | FileCheck %s --check-prefix=EPILOGUNWINDREQUIREV2 +// RUN: %clang_cl /d2epilogunwindrequirev2 /d2epilogunwind /c -### -- %s 2>&1 | FileCheck %s --check-prefix=EPILOGUNWINDREQUIREV2 +// EPILOGUNWINDREQUIREV2: -fwinx64-eh-unwindv2=require // RUN: %clang_cl /funcoverride:override_me1 /funcoverride:override_me2 /c -### -- %s 2>&1 | FileCheck %s --check-prefix=FUNCOVERRIDE // FUNCOVERRIDE: -loader-replaceable-function=override_me1 diff --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c index 5420555c36a2a..c57e9aa7a3cc2 100644 --- a/clang/test/Driver/fveclib.c +++ b/clang/test/Driver/fveclib.c @@ -1,6 +1,7 @@ // RUN: %clang -### -c -fveclib=none %s 2>&1 | FileCheck --check-prefix=CHECK-NOLIB %s // RUN: %clang -### -c -fveclib=Accelerate %s 2>&1 | FileCheck --check-prefix=CHECK-ACCELERATE %s // RUN: %clang -### -c --target=x86_64-unknown-linux-gnu -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-libmvec %s +// RUN: %clang -### -c --target=aarch64-linux-gnu -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-LIBMVEC-AARCH64 %s // RUN: %clang -### -c --target=x86_64-unknown-linux-gnu -fveclib=AMDLIBM %s 2>&1 | FileCheck --check-prefix=CHECK-AMDLIBM %s // 
RUN: %clang -### -c -fveclib=MASSV %s 2>&1 | FileCheck --check-prefix=CHECK-MASSV %s // RUN: %clang -### -c -fveclib=Darwin_libsystem_m %s 2>&1 | FileCheck --check-prefix=CHECK-DARWIN_LIBSYSTEM_M %s @@ -12,6 +13,7 @@ // CHECK-NOLIB: "-fveclib=none" // CHECK-ACCELERATE: "-fveclib=Accelerate" // CHECK-libmvec: "-fveclib=libmvec" +// CHECK-LIBMVEC-AARCH64: "-fveclib=libmvec" // CHECK-AMDLIBM: "-fveclib=AMDLIBM" // CHECK-MASSV: "-fveclib=MASSV" // CHECK-DARWIN_LIBSYSTEM_M: "-fveclib=Darwin_libsystem_m" @@ -23,7 +25,6 @@ // RUN: not %clang --target=x86 -c -fveclib=SLEEF %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s // RUN: not %clang --target=x86 -c -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s -// RUN: not %clang --target=aarch64 -c -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s // RUN: not %clang --target=aarch64 -c -fveclib=SVML %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s // RUN: not %clang --target=aarch64 -c -fveclib=AMDLIBM %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s // CHECK-ERROR: unsupported option {{.*}} for target @@ -43,6 +44,9 @@ // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=libmvec -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-LIBMVEC %s // CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC" +// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=libmvec -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-LIBMVEC-AARCH64 %s +// CHECK-LTO-LIBMVEC-AARCH64: "-plugin-opt=-vector-library=LIBMVEC" + // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=AMDLIBM -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-AMDLIBM %s // CHECK-LTO-AMDLIBM: "-plugin-opt=-vector-library=AMDLIBM" @@ -68,6 +72,10 @@ // CHECK-ERRNO-LIBMVEC: "-fveclib=libmvec" // CHECK-ERRNO-LIBMVEC-SAME: "-fmath-errno" +// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-ERRNO-LIBMVEC-AARCH64 %s +// CHECK-ERRNO-LIBMVEC-AARCH64: "-fveclib=libmvec" +// 
CHECK-ERRNO-LIBMVEC-AARCH64-SAME: "-fmath-errno" + // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=AMDLIBM %s 2>&1 | FileCheck --check-prefix=CHECK-ERRNO-AMDLIBM %s // CHECK-ERRNO-AMDLIBM: "-fveclib=AMDLIBM" // CHECK-ERRNO-AMDLIBM-SAME: "-fmath-errno" diff --git a/clang/test/Driver/ignored-pch.cpp b/clang/test/Driver/ignored-pch.cpp new file mode 100644 index 0000000000000..a3597dc0fe0d4 --- /dev/null +++ b/clang/test/Driver/ignored-pch.cpp @@ -0,0 +1,19 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t + +// Create PCH without -ignore-pch. +// RUN: %clang -x c++-header %S/Inputs/pchfile.h -### 2>&1 | FileCheck %s -check-prefix=CHECK-EMIT-PCH +// RUN: %clang -x c++-header %S/Inputs/pchfile.h -o %t/pchfile.h.pch +// RUN: %clang %s -include-pch %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-INCLUDE-PCH +// RUN: %clang %s -emit-ast -include-pch %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefixes=CHECK-EMIT-PCH,CHECK-INCLUDE-PCH + + +// Create PCH with -ignore-pch. 
+// RUN: %clang -x c++-header -ignore-pch %S/Inputs/pchfile.h -### 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-PCH +// RUN: %clang %s -ignore-pch -include-pch %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-PCH +// RUN: %clang %s -ignore-pch -emit-ast -include-pch %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-PCH + +// CHECK-EMIT-PCH: -emit-pch +// CHECK-INCLUDE-PCH: -include-pch +// CHECK-IGNORE-PCH-NOT: -emit-pch +// CHECK-IGNORE-PCH-NOT: -include-pch diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c index cfb4d0ed58d11..d8b3848d84520 100644 --- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c +++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c @@ -10,6 +10,7 @@ // CHECK-NEXT: f 2.2 'F' (Single-Precision Floating-Point) // CHECK-NEXT: d 2.2 'D' (Double-Precision Floating-Point) // CHECK-NEXT: c 2.0 'C' (Compressed Instructions) +// CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions) // CHECK-NEXT: zicsr 2.0 'Zicsr' (CSRs) // CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i) // CHECK-NEXT: zmmul 1.0 'Zmmul' (Integer Multiplication) @@ -18,8 +19,12 @@ // CHECK-NEXT: zca 1.0 'Zca' (part of the C extension, excluding compressed floating point loads/stores) // CHECK-NEXT: zcd 1.0 'Zcd' (Compressed Double-Precision Floating-Point Instructions) // CHECK-NEXT: zcf 1.0 'Zcf' (Compressed Single-Precision Floating-Point Instructions) +// CHECK-NEXT: zba 1.0 'Zba' (Address Generation Instructions) +// CHECK-NEXT: zbb 1.0 'Zbb' (Basic Bit-Manipulation) +// CHECK-NEXT: zbc 1.0 'Zbc' (Carry-Less Multiplication) +// CHECK-NEXT: zbs 1.0 'Zbs' (Single-Bit Instructions) // CHECK-NEXT: xandesperf 5.0 'XAndesPerf' (Andes Performance Extension) // CHECK-EMPTY: // CHECK-NEXT: Experimental extensions // CHECK-EMPTY: -// CHECK-NEXT: ISA String: 
rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0 +// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0 diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c index 3c3c554dffc57..a0a1c35911409 100644 --- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c +++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c @@ -10,6 +10,7 @@ // CHECK-NEXT: f 2.2 'F' (Single-Precision Floating-Point) // CHECK-NEXT: d 2.2 'D' (Double-Precision Floating-Point) // CHECK-NEXT: c 2.0 'C' (Compressed Instructions) +// CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions) // CHECK-NEXT: zicsr 2.0 'Zicsr' (CSRs) // CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i) // CHECK-NEXT: zmmul 1.0 'Zmmul' (Integer Multiplication) @@ -18,8 +19,11 @@ // CHECK-NEXT: zca 1.0 'Zca' (part of the C extension, excluding compressed floating point loads/stores) // CHECK-NEXT: zcd 1.0 'Zcd' (Compressed Double-Precision Floating-Point Instructions) // CHECK-NEXT: zcf 1.0 'Zcf' (Compressed Single-Precision Floating-Point Instructions) +// CHECK-NEXT: zba 1.0 'Zba' (Address Generation Instructions) +// CHECK-NEXT: zbb 1.0 'Zbb' (Basic Bit-Manipulation) +// CHECK-NEXT: zbs 1.0 'Zbs' (Single-Bit Instructions) // CHECK-NEXT: xandesperf 5.0 'XAndesPerf' (Andes Performance Extension) // CHECK-EMPTY: // CHECK-NEXT: Experimental extensions // CHECK-EMPTY: -// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0 +// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0 diff --git 
a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c index 70100a0a8df13..3f933ecd8ac83 100644 --- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c +++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c @@ -10,6 +10,7 @@ // CHECK-NEXT: f 2.2 'F' (Single-Precision Floating-Point) // CHECK-NEXT: d 2.2 'D' (Double-Precision Floating-Point) // CHECK-NEXT: c 2.0 'C' (Compressed Instructions) +// CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions) // CHECK-NEXT: zicsr 2.0 'Zicsr' (CSRs) // CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i) // CHECK-NEXT: zmmul 1.0 'Zmmul' (Integer Multiplication) @@ -17,8 +18,12 @@ // CHECK-NEXT: zalrsc 1.0 'Zalrsc' (Load-Reserved/Store-Conditional) // CHECK-NEXT: zca 1.0 'Zca' (part of the C extension, excluding compressed floating point loads/stores) // CHECK-NEXT: zcd 1.0 'Zcd' (Compressed Double-Precision Floating-Point Instructions) +// CHECK-NEXT: zba 1.0 'Zba' (Address Generation Instructions) +// CHECK-NEXT: zbb 1.0 'Zbb' (Basic Bit-Manipulation) +// CHECK-NEXT: zbc 1.0 'Zbc' (Carry-Less Multiplication) +// CHECK-NEXT: zbs 1.0 'Zbs' (Single-Bit Instructions) // CHECK-NEXT: xandesperf 5.0 'XAndesPerf' (Andes Performance Extension) // CHECK-EMPTY: // CHECK-NEXT: Experimental extensions // CHECK-EMPTY: -// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0 +// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0 diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c index d2b1a32e321e5..6460d701411bc 100644 --- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c +++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c 
@@ -10,6 +10,7 @@ // CHECK-NEXT: f 2.2 'F' (Single-Precision Floating-Point) // CHECK-NEXT: d 2.2 'D' (Double-Precision Floating-Point) // CHECK-NEXT: c 2.0 'C' (Compressed Instructions) +// CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions) // CHECK-NEXT: zicsr 2.0 'Zicsr' (CSRs) // CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i) // CHECK-NEXT: zmmul 1.0 'Zmmul' (Integer Multiplication) @@ -17,8 +18,11 @@ // CHECK-NEXT: zalrsc 1.0 'Zalrsc' (Load-Reserved/Store-Conditional) // CHECK-NEXT: zca 1.0 'Zca' (part of the C extension, excluding compressed floating point loads/stores) // CHECK-NEXT: zcd 1.0 'Zcd' (Compressed Double-Precision Floating-Point Instructions) +// CHECK-NEXT: zba 1.0 'Zba' (Address Generation Instructions) +// CHECK-NEXT: zbb 1.0 'Zbb' (Basic Bit-Manipulation) +// CHECK-NEXT: zbs 1.0 'Zbs' (Single-Bit Instructions) // CHECK-NEXT: xandesperf 5.0 'XAndesPerf' (Andes Performance Extension) // CHECK-EMPTY: // CHECK-NEXT: Experimental extensions // CHECK-EMPTY: -// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0 +// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0 diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c index 1a2c30bfc7a2e..4d9c514b756e6 100644 --- a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c +++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c @@ -10,6 +10,7 @@ // CHECK-NEXT: f 2.2 'F' (Single-Precision Floating-Point) // CHECK-NEXT: d 2.2 'D' (Double-Precision Floating-Point) // CHECK-NEXT: c 2.0 'C' (Compressed Instructions) +// CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions) // CHECK-NEXT: zicsr 2.0 'Zicsr' (CSRs) // CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i) // CHECK-NEXT: zmmul 
1.0 'Zmmul' (Integer Multiplication) @@ -18,8 +19,11 @@ // CHECK-NEXT: zca 1.0 'Zca' (part of the C extension, excluding compressed floating point loads/stores) // CHECK-NEXT: zcd 1.0 'Zcd' (Compressed Double-Precision Floating-Point Instructions) // CHECK-NEXT: zcf 1.0 'Zcf' (Compressed Single-Precision Floating-Point Instructions) +// CHECK-NEXT: zba 1.0 'Zba' (Address Generation Instructions) +// CHECK-NEXT: zbb 1.0 'Zbb' (Basic Bit-Manipulation) +// CHECK-NEXT: zbs 1.0 'Zbs' (Single-Bit Instructions) // CHECK-NEXT: xandesperf 5.0 'XAndesPerf' (Andes Performance Extension) // CHECK-EMPTY: // CHECK-NEXT: Experimental extensions // CHECK-EMPTY: -// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0 +// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0 diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c index 50c38da3bd034..5eaada3f9e164 100644 --- a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c +++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c @@ -10,6 +10,7 @@ // CHECK-NEXT: f 2.2 'F' (Single-Precision Floating-Point) // CHECK-NEXT: d 2.2 'D' (Double-Precision Floating-Point) // CHECK-NEXT: c 2.0 'C' (Compressed Instructions) +// CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions) // CHECK-NEXT: zicsr 2.0 'Zicsr' (CSRs) // CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i) // CHECK-NEXT: zmmul 1.0 'Zmmul' (Integer Multiplication) @@ -17,8 +18,11 @@ // CHECK-NEXT: zalrsc 1.0 'Zalrsc' (Load-Reserved/Store-Conditional) // CHECK-NEXT: zca 1.0 'Zca' (part of the C extension, excluding compressed floating point loads/stores) // CHECK-NEXT: zcd 1.0 'Zcd' (Compressed Double-Precision Floating-Point Instructions) +// CHECK-NEXT: zba 
1.0 'Zba' (Address Generation Instructions) +// CHECK-NEXT: zbb 1.0 'Zbb' (Basic Bit-Manipulation) +// CHECK-NEXT: zbs 1.0 'Zbs' (Single-Bit Instructions) // CHECK-NEXT: xandesperf 5.0 'XAndesPerf' (Andes Performance Extension) // CHECK-EMPTY: // CHECK-NEXT: Experimental extensions // CHECK-EMPTY: -// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0 +// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0 diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 95464f06378e2..5008c2b7f789d 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -159,6 +159,7 @@ // CHECK-NEXT: svpbmt 1.0 'Svpbmt' (Page-Based Memory Types) // CHECK-NEXT: svvptc 1.0 'Svvptc' (Obviating Memory-Management Instructions after Marking PTEs Valid) // CHECK-NEXT: xandesperf 5.0 'XAndesPerf' (Andes Performance Extension) +// CHECK-NEXT: xandesvbfhcvt 5.0 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension) // CHECK-NEXT: xandesvdot 5.0 'XAndesVDot' (Andes Vector Dot Product Extension) // CHECK-NEXT: xandesvpackfph 5.0 'XAndesVPackFPH' (Andes Vector Packed FP16 Extension) // CHECK-NEXT: xcvalu 1.0 'XCValu' (CORE-V ALU Operations) @@ -213,7 +214,7 @@ // CHECK-NEXT: smctr 1.0 'Smctr' (Control Transfer Records Machine Level) // CHECK-NEXT: ssctr 1.0 'Ssctr' (Control Transfer Records Supervisor Level) // CHECK-NEXT: svukte 0.3 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses) -// CHECK-NEXT: xqccmp 0.1 'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves) +// CHECK-NEXT: xqccmp 0.3 'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves) // CHECK-NEXT: xqcia 0.7 'Xqcia' (Qualcomm uC Arithmetic Extension) // CHECK-NEXT: xqciac 0.3 
'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension) // CHECK-NEXT: xqcibi 0.2 'Xqcibi' (Qualcomm uC Branch Immediate Extension) @@ -221,14 +222,14 @@ // CHECK-NEXT: xqcicli 0.3 'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension) // CHECK-NEXT: xqcicm 0.2 'Xqcicm' (Qualcomm uC Conditional Move Extension) // CHECK-NEXT: xqcics 0.2 'Xqcics' (Qualcomm uC Conditional Select Extension) -// CHECK-NEXT: xqcicsr 0.3 'Xqcicsr' (Qualcomm uC CSR Extension) -// CHECK-NEXT: xqciint 0.7 'Xqciint' (Qualcomm uC Interrupts Extension) +// CHECK-NEXT: xqcicsr 0.4 'Xqcicsr' (Qualcomm uC CSR Extension) +// CHECK-NEXT: xqciint 0.10 'Xqciint' (Qualcomm uC Interrupts Extension) // CHECK-NEXT: xqciio 0.1 'Xqciio' (Qualcomm uC External Input Output Extension) // CHECK-NEXT: xqcilb 0.2 'Xqcilb' (Qualcomm uC Long Branch Extension) // CHECK-NEXT: xqcili 0.2 'Xqcili' (Qualcomm uC Load Large Immediate Extension) // CHECK-NEXT: xqcilia 0.2 'Xqcilia' (Qualcomm uC Large Immediate Arithmetic Extension) // CHECK-NEXT: xqcilo 0.3 'Xqcilo' (Qualcomm uC Large Offset Load Store Extension) -// CHECK-NEXT: xqcilsm 0.5 'Xqcilsm' (Qualcomm uC Load Store Multiple Extension) +// CHECK-NEXT: xqcilsm 0.6 'Xqcilsm' (Qualcomm uC Load Store Multiple Extension) // CHECK-NEXT: xqcisim 0.2 'Xqcisim' (Qualcomm uC Simulation Hint Extension) // CHECK-NEXT: xqcisls 0.2 'Xqcisls' (Qualcomm uC Scaled Load Store Extension) // CHECK-NEXT: xqcisync 0.3 'Xqcisync' (Qualcomm uC Sync Delay Extension) diff --git a/clang/test/Modules/preferred_name_header_unit.cpp b/clang/test/Modules/preferred_name_header_unit.cpp new file mode 100644 index 0000000000000..b1f1e3579f31e --- /dev/null +++ b/clang/test/Modules/preferred_name_header_unit.cpp @@ -0,0 +1,64 @@ +// RUN: rm -fR %t +// RUN: split-file %s %t +// RUN: cd %t +// RUN: %clang_cc1 -verify -w -std=c++20 -fmodule-name=h1.h -emit-header-unit -xc++-user-header h1.h -o h1.pcm +// RUN: %clang_cc1 -verify -w -std=c++20 -fmodule-map-file=module.modulemap 
-fmodule-file=h1.h=h1.pcm main.cpp -o main.o + +//--- module.modulemap +module "h1.h" { + header "h1.h" + export * +} + +//--- h0.h +// expected-no-diagnostics +#pragma once +namespace std { + +template class basic_string; + +namespace pmr { +using string = basic_string; +} + +template +class __attribute__((__preferred_name__(pmr::string))) basic_string; + +template class basic_string_view {}; + +template class basic_string { + typedef _CharT value_type; + typedef _Allocator allocator_type; + struct __rep; +public: + template + basic_string(_Tp) {} + basic_string operator+=(value_type); +}; + +namespace filesystem { +class path { + typedef char value_type; + value_type preferred_separator; + typedef basic_string string_type; + typedef basic_string_view __string_view; + template void append(_Source) { + __pn_ += preferred_separator; + } + void __root_directory() { append(string_type(__string_view{})); } + string_type __pn_; +}; +} // namespace filesystem +} // namespace std + +//--- h1.h +// expected-no-diagnostics +#pragma once + +#include "h0.h" + +//--- main.cpp +// expected-no-diagnostics +#include "h0.h" + +import "h1.h"; diff --git a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl index aad8836db1062..f37d00503fe57 100644 --- a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl +++ b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl @@ -4,7 +4,7 @@ // SPIRV: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016' // valid: "spirv-unknown-vulkan-library" -// valid: define spir_func void @{{.*main.*}}() #0 { +// valid: define hidden spir_func void @{{.*main.*}}() #0 { [numthreads(1,1,1)] void main() diff --git a/clang/test/PCH/Inputs/ignored-pch.h b/clang/test/PCH/Inputs/ignored-pch.h new file mode 100644 index 0000000000000..56047037c331f --- /dev/null +++ b/clang/test/PCH/Inputs/ignored-pch.h @@ -0,0 +1,6 @@ +#ifndef 
IGNORED_PCH_H +#define IGNORED_PCH_H +inline int f() { + return 42; +} +#endif // IGNORED_PCH_H diff --git a/clang/test/PCH/ignored-pch.c b/clang/test/PCH/ignored-pch.c new file mode 100644 index 0000000000000..5b64582cba618 --- /dev/null +++ b/clang/test/PCH/ignored-pch.c @@ -0,0 +1,113 @@ +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -o %t.ll +// RUN: ls %t.pch | FileCheck --check-prefix=CHECK-PCH %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch +// RUN: %clang %s -emit-ast -include-pch %t.pch -o %t.ll +// RUN: ls %t.pch | FileCheck --check-prefix=CHECK-PCH %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// Check that -ignore-pch causes -emit-pch and -include-pch options to be ignored. +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s +// RUN: ls %t.ll 2>&1 | FileCheck --check-prefix=CHECK-OBJ %s + +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -emit-ast %s -include-pch %t.pch -ignore-pch -o %t.ll +// RUN: not ls %t.ll 2>&1 | FileCheck --check-prefix=CHECK-OBJ-ERROR %s + +// Check that -ignore-pch works for multiple PCH related options. +// Test with -building-pch-with-obj. +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -building-pch-with-obj -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -building-pch-with-obj -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// Test with -fallow-pch-with-compiler-errors. 
+// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fallow-pch-with-compiler-errors -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fallow-pch-with-compiler-errors -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// Test with -fallow-pch-with-different-modules-cache-path. +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fallow-pch-with-different-modules-cache-path -o %t.pch +// RUN: %clang -S -emit-llvm %s -ignore-pch -include-pch %t.pch -Xclang -fallow-pch-with-different-modules-cache-path -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// Test with -fpch-codegen. +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-codegen -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-codegen -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// Test with -fpch-debuginfo. +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-debuginfo -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-debuginfo -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// Test with -fpch-instantiate-templates. 
+// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-instantiate-templates -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-instantiate-templates -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// Test with -fno-pch-timestamp. +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fno-pch-timestamp -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fno-pch-timestamp -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// Test with -fno-validate-pch. +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fno-validate-pch -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fno-validate-pch -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// Test with -relocatable-pch. 
+// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -relocatable-pch -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -relocatable-pch -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + +// Test with -pch-through-hdrstop-create/-pch-through-hdrstop-use +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -pch-through-hdrstop-create -o %t.pch +// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -pch-through-hdrstop-use -o %t.ll +// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s +// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s + + +// Test with AST dump output: +// RUN: rm -rf %t.pch %t.ll +// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch +// RUN: %clang %s -include-pch %t.pch -Xclang -ast-dump-all -c | FileCheck --check-prefix=CHECK-AST-PCH %s +// RUN: %clang %s -include-pch %t.pch -ignore-pch -Xclang -ast-dump-all -c | FileCheck --check-prefix=CHECK-AST %s + +// CHECK-PCH: ignored-pch.c.{{.*}}.pch +// CHECK-OBJ: ignored-pch.c.{{.*}}.ll +// CHECK-PCH-ERROR: ignored-pch.c.{{.*}}.pch{{'?}}: No such file or directory +// CHECK-OBJ-ERROR: ignored-pch.c.{{.*}}.ll{{'?}}: No such file or directory +// CHECK-AST-PCH: +// CHECK-AST-NOT: + +#pragma hdrstop +#include "Inputs/ignored-pch.h" +int main() { + return f(); +} diff --git a/clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp b/clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp new file mode 100644 index 0000000000000..94fa8c8c820a5 --- /dev/null +++ b/clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +// issue144264 +constexpr void test() +{ + using TT = struct T[; + // expected-error@-1 {{expected expression}} +} diff --git 
a/clang/test/Preprocessor/riscv-target-features-andes.c b/clang/test/Preprocessor/riscv-target-features-andes.c index 3cd9b04354132..c66d4427b5cf2 100644 --- a/clang/test/Preprocessor/riscv-target-features-andes.c +++ b/clang/test/Preprocessor/riscv-target-features-andes.c @@ -15,6 +15,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-XANDESPERF %s // CHECK-XANDESPERF: __riscv_xandesperf 5000000{{$}} +// RUN: %clang --target=riscv32 \ +// RUN: -march=rv32i_xandesvbfhcvt -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XANDESVBFHCVT %s +// RUN: %clang --target=riscv64 \ +// RUN: -march=rv64i_xandesvbfhcvt -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XANDESVBFHCVT %s +// CHECK-XANDESVBFHCVT: __riscv_xandesvbfhcvt 5000000{{$}} + // RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_xandesvpackfph -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-XANDESVPACKFPH %s diff --git a/clang/test/Sema/gh87867.c b/clang/test/Sema/gh87867.c new file mode 100644 index 0000000000000..0568c734424ca --- /dev/null +++ b/clang/test/Sema/gh87867.c @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c23 %s + +// Compound literal doesn't need a constant expression inside a initializer-list if it is already inside a function +// see: https://github.com/llvm/llvm-project/issues/87867 +int foo(int *a, int b) { + return 0; +} + +int x; +struct{int t;} a = (struct { + typeof(foo(&(struct { int t; }){.t = x}.t, 0)) t; // expected-error {{initializer element is not a compile-time constant}} +}){0}; + +void inside_a_func(){ + int x; + (void)(struct { + typeof(foo(&(struct { int t; }){.t = x}.t, 0)) t; + }){0}; +} + +// see: https://github.com/llvm/llvm-project/issues/143613 +#define bitcast(type, value) \ + (((union{ typeof(value) src; type dst; }){ (value) }).dst) + +double placeholder = 10.0; +double bar = bitcast(double, placeholder); // expected-error {{initializer element is not a compile-time constant}} + +int main(void) +{ + int foo = 
4; + foo = bitcast(int, bitcast(double, foo)); + return 0; +} diff --git a/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp b/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp index c775fe71069df..66981acf87a8a 100644 --- a/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp +++ b/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp @@ -154,3 +154,17 @@ namespace narrowing { // expected-note {{insert an explicit cast to silence this issue}} } } + +struct GH99680 { + static const int x1 = 1/(1-__builtin_is_constant_evaluated()); // expected-error {{in-class initializer for static data member is not a constant expression}} \ + // expected-note {{division by zero}} + static const int x2 = __builtin_is_constant_evaluated(); + static_assert(x2 == 1); + static const float x3 = 1/(1-__builtin_is_constant_evaluated()); // expected-error {{in-class initializer for static data member of type 'const float' requires 'constexpr' specifier}} \ + // expected-note {{add 'constexpr'}} \ + // expected-error {{in-class initializer for static data member is not a constant expression}} \ + // expected-note {{division by zero}} + static const float x4 = __builtin_is_constant_evaluated(); // expected-error {{in-class initializer for static data member of type 'const float' requires 'constexpr' specifier}} \ + // expected-note {{add 'constexpr'}} + static_assert(fold(x4 == 1)); +}; diff --git a/clang/test/SemaCXX/class.cpp b/clang/test/SemaCXX/class.cpp index 2f59544e7f36c..f1e02d5158aac 100644 --- a/clang/test/SemaCXX/class.cpp +++ b/clang/test/SemaCXX/class.cpp @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 -Wc++11-compat %s -// RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s -std=c++98 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98 -Wc++11-compat %s -std=c++98 class C { public: auto int errx; // expected-error {{storage class specified for a member declaration}} @@ -32,7 +32,7 @@ class C { int : 1, : 2; typedef int E : 1; // 
expected-error {{typedef member 'E' cannot be a bit-field}} static int sb : 1; // expected-error {{static member 'sb' cannot be a bit-field}} - static int vs; + static int vs; // cxx11-note {{declared here}} typedef int func(); func tm; @@ -48,20 +48,28 @@ class C { #endif static int si = 0; // expected-error {{non-const static data member must be initialized out of line}} static const NestedC ci = 0; // expected-error {{static data member of type 'const NestedC' must be initialized out of line}} - static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}} + static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}} \ + // cxx11-note {{read of non-const variable 'vs' is not allowed in a constant expression}} \ + // cxx98-note {{subexpression not valid in a constant expression}} static const int vi = 0; static const volatile int cvi = 0; // ok, illegal in C++11 #if __cplusplus >= 201103L // expected-error@-2 {{static const volatile data member must be initialized out of line}} #endif static const E evi = 0; - static const int overflow = 1000000*1000000; // cxx11-error {{in-class initializer for static data member is not a constant expression}} - // expected-warning@-1 {{overflow in expression}} - static const int overflow_shift = 1<<32; // cxx11-error {{in-class initializer for static data member is not a constant expression}} - static const int overflow_shift2 = 1>>32; // cxx11-error {{in-class initializer for static data member is not a constant expression}} - static const int overflow_shift3 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} - static const int overflow_shift4 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} - static const int overflow_shift5 = -1<<1; // cxx11-error {{in-class initializer for static data member is not a 
constant expression}} + static const int overflow = 1000000*1000000; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \ + // cxx11-note {{value 1000000000000 is outside the range of representable values of type 'int'}} \ + // expected-warning {{overflow in expression}} + static const int overflow_shift = 1<<32; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \ + // cxx11-note {{shift count 32 >= width of type 'int' (32 bits)}} + static const int overflow_shift2 = 1>>32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}\ + // cxx11-note {{shift count 32 >= width of type 'int' (32 bits)}} + static const int overflow_shift3 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \ + // cxx11-note {{negative shift count -1}} + static const int overflow_shift4 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \ + // cxx11-note {{negative shift count -1}} + static const int overflow_shift5 = -1<<1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \ + // cxx11-note {{left shift of negative value -1}} void m() { sx = 0; diff --git a/clang/test/SemaCXX/cxx0x-class.cpp b/clang/test/SemaCXX/cxx0x-class.cpp index a612a5c07e6ed..4b54221cceff2 100644 --- a/clang/test/SemaCXX/cxx0x-class.cpp +++ b/clang/test/SemaCXX/cxx0x-class.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -Wno-uninitialized -fsyntax-only -verify -std=c++11 -Wno-error=static-float-init %s -int vs = 0; +int vs = 0; // expected-note {{declared here}} class C { public: @@ -11,17 +11,20 @@ class C { int i = 0; static int si = 0; // expected-error {{non-const static data member must be initialized out of line}} static const NestedC ci = 0; // expected-error {{static data member of type 'const NestedC' must be initialized out of line}} - static const int nci = 
vs; // expected-error {{in-class initializer for static data member is not a constant expression}} + static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}} \ + // expected-note {{read of non-const variable 'vs' is not allowed in a constant expression}} static const int vi = 0; static const volatile int cvi = 0; // expected-error {{static const volatile data member must be initialized out of line}} }; namespace rdar8367341 { - float foo(); // expected-note {{here}} + float foo(); // expected-note 2 {{here}} struct A { static const float x = 5.0f; // expected-warning {{requires 'constexpr'}} expected-note {{add 'constexpr'}} - static const float y = foo(); // expected-warning {{requires 'constexpr'}} expected-note {{add 'constexpr'}} + static const float y = foo(); // expected-warning {{requires 'constexpr'}} expected-note {{add 'constexpr'}} \ + // expected-error {{in-class initializer for static data member is not a constant expression}} \ + // expected-note {{non-constexpr function 'foo' cannot be used in a constant expression}} static constexpr float x2 = 5.0f; static constexpr float y2 = foo(); // expected-error {{must be initialized by a constant expression}} expected-note {{non-constexpr function 'foo'}} }; diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp index d9932e4dd8241..1474c48cda3c1 100644 --- a/clang/test/SemaCXX/cxx2a-consteval.cpp +++ b/clang/test/SemaCXX/cxx2a-consteval.cpp @@ -1154,20 +1154,20 @@ namespace GH65985 { int consteval operator""_foo(unsigned long long V) { return 0; } -int consteval operator""_bar(unsigned long long V); // expected-note 3{{here}} +int consteval operator""_bar(unsigned long long V); // expected-note 4 {{here}} int consteval f() { return 0; } -int consteval g(); // expected-note {{here}} +int consteval g(); // expected-note 2 {{here}} struct C { static const int a = 1_foo; static constexpr int b = 1_foo; static 
const int c = 1_bar; // expected-error {{call to consteval function 'GH65985::operator""_bar' is not a constant expression}} \ - // expected-note {{undefined function 'operator""_bar' cannot be used in a constant expression}} \ + // expected-note 2 {{undefined function 'operator""_bar' cannot be used in a constant expression}} \ // expected-error {{in-class initializer for static data member is not a constant expression}} // FIXME: remove duplicate diagnostics @@ -1179,7 +1179,7 @@ struct C { static const int e = f(); static const int f = g(); // expected-error {{call to consteval function 'GH65985::g' is not a constant expression}} \ // expected-error {{in-class initializer for static data member is not a constant expression}} \ - // expected-note {{undefined function 'g' cannot be used in a constant expression}} + // expected-note 2 {{undefined function 'g' cannot be used in a constant expression}} }; } diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp index 9d43994ee7661..7152a5937d9b7 100644 --- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp +++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -std=c++2c -verify %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-intrinsics -fptrauth-calls -std=c++2c -verify %s class Trivial {}; static_assert(__builtin_is_cpp_trivially_relocatable(Trivial)); diff --git a/clang/test/SemaCXX/ptrauth-triviality.cpp b/clang/test/SemaCXX/ptrauth-triviality.cpp index 60d1b57230f18..ba8a8273d5c05 100644 --- a/clang/test/SemaCXX/ptrauth-triviality.cpp +++ b/clang/test/SemaCXX/ptrauth-triviality.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++20 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s -// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++20 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s +// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++26 -fptrauth-calls 
-fptrauth-intrinsics -verify -fsyntax-only %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++26 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s #define AQ __ptrauth(1,1,50) #define IQ __ptrauth(1,0,50) @@ -83,7 +83,7 @@ static_assert(!__is_trivially_constructible(Holder, const Holder&)); static_assert(!__is_trivially_assignable(Holder, const Holder&)); static_assert(__is_trivially_destructible(Holder)); static_assert(!__is_trivially_copyable(Holder)); -static_assert(__is_trivially_relocatable(Holder)); // expected-warning{{deprecated}} +static_assert(!__is_trivially_relocatable(Holder)); // expected-warning{{deprecated}} static_assert(__builtin_is_cpp_trivially_relocatable(Holder)); static_assert(!__is_trivially_equality_comparable(Holder)); @@ -99,7 +99,6 @@ static_assert(!__is_trivially_assignable(S4, const S4&)); static_assert(__is_trivially_destructible(S4)); static_assert(!__is_trivially_copyable(S4)); static_assert(!__is_trivially_relocatable(S4)); // expected-warning{{deprecated}} -//FIXME static_assert(__builtin_is_cpp_trivially_relocatable(S4)); static_assert(!__is_trivially_equality_comparable(S4)); @@ -124,7 +123,6 @@ static_assert(!__is_trivially_assignable(S5, const S5&)); static_assert(__is_trivially_destructible(S5)); static_assert(!__is_trivially_copyable(S5)); static_assert(!__is_trivially_relocatable(S5)); // expected-warning{{deprecated}} -//FIXME static_assert(__builtin_is_cpp_trivially_relocatable(S5)); static_assert(!__is_trivially_equality_comparable(S5)); @@ -182,3 +180,39 @@ static_assert(__is_trivially_copyable(Holder)); static_assert(__is_trivially_relocatable(Holder)); // expected-warning{{deprecated}} static_assert(__builtin_is_cpp_trivially_relocatable(Holder)); static_assert(__is_trivially_equality_comparable(Holder)); + +template struct MultipleInheriter : Bases... 
{ +}; + +template static const bool test_is_trivially_relocatable_v = __builtin_is_cpp_trivially_relocatable(T); +template static const bool multiple_inheritance_is_relocatable = test_is_trivially_relocatable_v>; +template static const bool inheritance_relocatability_matches_bases_v = + (test_is_trivially_relocatable_v && ...) == multiple_inheritance_is_relocatable; + +static_assert(multiple_inheritance_is_relocatable == multiple_inheritance_is_relocatable); +static_assert(inheritance_relocatability_matches_bases_v); +static_assert(inheritance_relocatability_matches_bases_v); + +struct AA AddressDiscriminatedPolymorphicBase trivially_relocatable_if_eligible { + virtual void foo(); +}; + +struct IA NoAddressDiscriminatedPolymorphicBase trivially_relocatable_if_eligible { + virtual void bar(); +}; + +template struct UnionWrapper trivially_relocatable_if_eligible { + union U { + T field1; + } u; +}; + +static_assert(test_is_trivially_relocatable_v); +static_assert(test_is_trivially_relocatable_v); +static_assert(inheritance_relocatability_matches_bases_v); +static_assert(inheritance_relocatability_matches_bases_v); + +static_assert(!test_is_trivially_relocatable_v>); +static_assert(test_is_trivially_relocatable_v>); +static_assert(!test_is_trivially_relocatable_v>>); +static_assert(!test_is_trivially_relocatable_v>>); diff --git a/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp b/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp new file mode 100644 index 0000000000000..b38499a634fcf --- /dev/null +++ b/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp @@ -0,0 +1,109 @@ +// RUN: %clang_cc1 -triple arm64 -fptrauth-calls -fptrauth-intrinsics -std=c++26 -verify %s + +// This test intentionally does not enable the global address discrimination +// of vtable pointers. 
This lets us configure them with different schemas +// and verify that we're correctly tracking the existence of address discrimination + +// expected-no-diagnostics + +struct NonAddressDiscPtrauth { + void * __ptrauth(1, 0, 1234) p; +}; + +static_assert(__builtin_is_cpp_trivially_relocatable(NonAddressDiscPtrauth)); + +struct AddressDiscPtrauth { + void * __ptrauth(1, 1, 1234) p; +}; + +static_assert(!__builtin_is_cpp_trivially_relocatable(AddressDiscPtrauth)); + +struct MultipleBaseClasses : NonAddressDiscPtrauth, AddressDiscPtrauth { + +}; + +static_assert(!__builtin_is_cpp_trivially_relocatable(MultipleBaseClasses)); + +struct MultipleMembers1 { + NonAddressDiscPtrauth field0; + AddressDiscPtrauth field1; +}; + +static_assert(!__builtin_is_cpp_trivially_relocatable(MultipleMembers1)); + +struct MultipleMembers2 { + NonAddressDiscPtrauth field0; + NonAddressDiscPtrauth field1; +}; + +static_assert(__builtin_is_cpp_trivially_relocatable(MultipleMembers2)); + +struct UnionOfPtrauth { + union { + NonAddressDiscPtrauth field0; + AddressDiscPtrauth field1; + } u; +}; + +static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfPtrauth)); + +struct [[clang::ptrauth_vtable_pointer(process_independent,address_discrimination,no_extra_discrimination)]] Polymorphic trivially_relocatable_if_eligible { + virtual ~Polymorphic(); +}; + +struct Foo : Polymorphic { + Foo(const Foo&); + ~Foo(); +}; + + +static_assert(__builtin_is_cpp_trivially_relocatable(Polymorphic)); + +struct [[clang::ptrauth_vtable_pointer(process_independent,no_address_discrimination,no_extra_discrimination)]] NonAddressDiscriminatedPolymorphic trivially_relocatable_if_eligible { + virtual ~NonAddressDiscriminatedPolymorphic(); +}; + +static_assert(__builtin_is_cpp_trivially_relocatable(NonAddressDiscriminatedPolymorphic)); + + +struct PolymorphicMembers { + Polymorphic field; +}; + +static_assert(__builtin_is_cpp_trivially_relocatable(PolymorphicMembers)); + +struct UnionOfPolymorphic { + union 
trivially_relocatable_if_eligible { + Polymorphic p; + int i; + } u; +}; + +static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfPolymorphic)); + + +struct UnionOfNonAddressDiscriminatedPolymorphic { + union trivially_relocatable_if_eligible { + NonAddressDiscriminatedPolymorphic p; + int i; + } u; +}; +static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfNonAddressDiscriminatedPolymorphic)); + +struct UnionOfNonAddressDiscriminatedPtrauth { + union { + NonAddressDiscPtrauth p; + int i; + } u; +}; + +static_assert(__builtin_is_cpp_trivially_relocatable(UnionOfNonAddressDiscriminatedPtrauth)); + +struct UnionOfAddressDisriminatedPtrauth { + union { + AddressDiscPtrauth p; + int i; + } u; +}; + +static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfAddressDisriminatedPtrauth)); diff --git a/clang/test/SemaTemplate/instantiate-static-var.cpp b/clang/test/SemaTemplate/instantiate-static-var.cpp index 63d8366b617c1..6602670af901f 100644 --- a/clang/test/SemaTemplate/instantiate-static-var.cpp +++ b/clang/test/SemaTemplate/instantiate-static-var.cpp @@ -1,11 +1,13 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s -// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s -// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98 -std=c++98 %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 -std=c++11 %s template class X { public: - static const T value = 10 / Divisor; // expected-error{{in-class initializer for static data member is not a constant expression}} + static const T value = 10 / Divisor; // expected-error{{in-class initializer for static data member is not a constant expression}} \ + // cxx11-note {{division by zero}} \ + // cxx98-note {{subexpression not valid}} }; int array1[X::value == 5? 
1 : -1]; diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 7a1007d03737e..0f1fa8b329fd6 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -268,7 +268,8 @@ Expected findProgram(StringRef Name, ArrayRef Paths) { bool linkerSupportsLTO(const ArgList &Args) { llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); return Triple.isNVPTX() || Triple.isAMDGPU() || - Args.getLastArgValue(OPT_linker_path_EQ).ends_with("lld"); + (!Triple.isGPU() && + Args.getLastArgValue(OPT_linker_path_EQ).ends_with("lld")); } /// Returns the hashed value for a constant string. diff --git a/clang/unittests/Parse/CMakeLists.txt b/clang/unittests/Parse/CMakeLists.txt index 6859efed294c8..2ed43a83b8782 100644 --- a/clang/unittests/Parse/CMakeLists.txt +++ b/clang/unittests/Parse/CMakeLists.txt @@ -11,5 +11,6 @@ add_clang_unittest(ParseTests LLVMTestingSupport clangTesting LLVM_COMPONENTS + FrontendHLSL Support ) diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 42627f02cf356..f892626a447e5 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3739,7 +3739,8 @@ static void GenerateHasAttrSpellingStringSwitch( : '(' + itostr(Version) + ')'; if (Scope.empty() || Scope == Spelling.nameSpace()) { - if (TestStringMap.contains(Spelling.name())) + if (TestStringMap.contains(Spelling.name()) && + TestStringMap[Spelling.name()] != TestStr) TestStringMap[Spelling.name()] += " || " + TestStr; else TestStringMap[Spelling.name()] = TestStr; diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 075c4647abf69..5e832315f3666 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -884,7 +884,11 @@ else () if(COMPILER_RT_DISABLE_AARCH64_FMV) 
list(APPEND BUILTIN_DEFS DISABLE_AARCH64_FMV) elseif(COMPILER_RT_BAREMETAL_BUILD) - list(APPEND BUILTIN_DEFS ENABLE_BAREMETAL_AARCH64_FMV) + foreach (arch ${BUILTIN_SUPPORTED_ARCH}) + if("${arch}" MATCHES "arm64|aarch64") + list(APPEND BUILTIN_DEFS ENABLE_BAREMETAL_AARCH64_FMV) + endif() + endforeach () endif() append_list_if(COMPILER_RT_HAS_ASM_LSE HAS_ASM_LSE BUILTIN_DEFS) diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp index 76255cdb742a3..f94d3cb79aa00 100644 --- a/compiler-rt/lib/msan/msan_interceptors.cpp +++ b/compiler-rt/lib/msan/msan_interceptors.cpp @@ -1127,7 +1127,7 @@ static void SignalAction(int signo, void *si, void *uc) { SignalHandlerScope signal_handler_scope; ScopedThreadLocalStateBackup stlsb; UnpoisonParam(3); - __msan_unpoison(si, sizeof(__sanitizer_sigaction)); + __msan_unpoison(si, sizeof(__sanitizer_siginfo)); __msan_unpoison(uc, ucontext_t_sz(uc)); typedef void (*sigaction_cb)(int, void *, void *); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h index 9f5f41cd85514..4c8d9a9b86bed 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h @@ -14,7 +14,8 @@ #if !defined(__linux__) && !defined(__FreeBSD__) && !defined(__NetBSD__) && \ !defined(__APPLE__) && !defined(_WIN32) && !defined(__Fuchsia__) && \ - !(defined(__sun__) && defined(__svr4__)) && !defined(__HAIKU__) + !(defined(__sun__) && defined(__svr4__)) && !defined(__HAIKU__) && \ + !defined(__wasi__) # error "This operating system is not supported" #endif @@ -61,6 +62,12 @@ # define SANITIZER_HAIKU 0 #endif +#if defined(__wasi__) +# define SANITIZER_WASI 1 +#else +# define SANITIZER_WASI 0 +#endif + // - SANITIZER_APPLE: all Apple code // - TARGET_OS_OSX: macOS // - SANITIZER_IOS: devices (iOS and iOS-like) diff --git a/flang-rt/include/flang-rt/runtime/environment.h 
b/flang-rt/include/flang-rt/runtime/environment.h index 16258b3bbba9b..e579f6012ce86 100644 --- a/flang-rt/include/flang-rt/runtime/environment.h +++ b/flang-rt/include/flang-rt/runtime/environment.h @@ -64,6 +64,9 @@ struct ExecutionEnvironment { bool defaultUTF8{false}; // DEFAULT_UTF8 bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION + enum InternalDebugging { WorkQueue = 1 }; + int internalDebugging{0}; // FLANG_RT_DEBUG + // CUDA related variables std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED diff --git a/flang-rt/include/flang-rt/runtime/stat.h b/flang-rt/include/flang-rt/runtime/stat.h index 070d0bf8673fb..dc372de53506a 100644 --- a/flang-rt/include/flang-rt/runtime/stat.h +++ b/flang-rt/include/flang-rt/runtime/stat.h @@ -24,7 +24,7 @@ class Terminator; enum Stat { StatOk = 0, // required to be zero by Fortran - // Interoperable STAT= codes + // Interoperable STAT= codes (>= 11) StatBaseNull = CFI_ERROR_BASE_ADDR_NULL, StatBaseNotNull = CFI_ERROR_BASE_ADDR_NOT_NULL, StatInvalidElemLen = CFI_INVALID_ELEM_LEN, @@ -36,7 +36,7 @@ enum Stat { StatMemAllocation = CFI_ERROR_MEM_ALLOCATION, StatOutOfBounds = CFI_ERROR_OUT_OF_BOUNDS, - // Standard STAT= values + // Standard STAT= values (>= 101) StatFailedImage = FORTRAN_RUNTIME_STAT_FAILED_IMAGE, StatLocked = FORTRAN_RUNTIME_STAT_LOCKED, StatLockedOtherImage = FORTRAN_RUNTIME_STAT_LOCKED_OTHER_IMAGE, @@ -49,10 +49,14 @@ enum Stat { // Additional "processor-defined" STAT= values StatInvalidArgumentNumber = FORTRAN_RUNTIME_STAT_INVALID_ARG_NUMBER, StatMissingArgument = FORTRAN_RUNTIME_STAT_MISSING_ARG, - StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, + StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, // -1 StatMoveAllocSameAllocatable = FORTRAN_RUNTIME_STAT_MOVE_ALLOC_SAME_ALLOCATABLE, StatBadPointerDeallocation = FORTRAN_RUNTIME_STAT_BAD_POINTER_DEALLOCATION, + + // Dummy status for work queue 
continuation, declared here to perhaps + // avoid collisions + StatContinue = 201 }; RT_API_ATTRS const char *StatErrorString(int); diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h index 5e79efde164f2..80301a313282f 100644 --- a/flang-rt/include/flang-rt/runtime/type-info.h +++ b/flang-rt/include/flang-rt/runtime/type-info.h @@ -154,12 +154,17 @@ class SpecialBinding { RT_API_ATTRS bool IsArgDescriptor(int zeroBasedArg) const { return (isArgDescriptorSet_ >> zeroBasedArg) & 1; } - RT_API_ATTRS bool isTypeBound() const { return isTypeBound_; } + RT_API_ATTRS bool IsTypeBound() const { return isTypeBound_ != 0; } RT_API_ATTRS bool IsArgContiguous(int zeroBasedArg) const { return (isArgContiguousSet_ >> zeroBasedArg) & 1; } - template RT_API_ATTRS PROC GetProc() const { - return reinterpret_cast(proc_); + template + RT_API_ATTRS PROC GetProc(const Binding *bindings = nullptr) const { + if (bindings && isTypeBound_ > 0) { + return reinterpret_cast(bindings[isTypeBound_ - 1].proc); + } else { + return reinterpret_cast(proc_); + } } FILE *Dump(FILE *) const; @@ -193,6 +198,8 @@ class SpecialBinding { // When false, the defined I/O subroutine must have been // called via a generic interface, not a generic TBP. std::uint8_t isArgDescriptorSet_{0}; + // When a special binding is type-bound, this is its binding's index (plus 1, + // so that 0 signifies that it's not type-bound). std::uint8_t isTypeBound_{0}; // True when a FINAL subroutine has a dummy argument that is an array that // is CONTIGUOUS or neither assumed-rank nor assumed-shape. 
@@ -240,6 +247,7 @@ class DerivedType { RT_API_ATTRS bool noFinalizationNeeded() const { return noFinalizationNeeded_; } + RT_API_ATTRS bool noDefinedAssignment() const { return noDefinedAssignment_; } RT_API_ATTRS std::size_t LenParameters() const { return lenParameterKind().Elements(); @@ -322,6 +330,7 @@ class DerivedType { bool noInitializationNeeded_{false}; bool noDestructionNeeded_{false}; bool noFinalizationNeeded_{false}; + bool noDefinedAssignment_{false}; }; } // namespace Fortran::runtime::typeInfo diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h new file mode 100644 index 0000000000000..0daa7bc4d3384 --- /dev/null +++ b/flang-rt/include/flang-rt/runtime/work-queue.h @@ -0,0 +1,555 @@ +//===-- include/flang-rt/runtime/work-queue.h -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Internal runtime utilities for work queues that replace the use of recursion +// for better GPU device support. +// +// A work queue comprises a list of tickets. Each ticket class has a Begin() +// member function, which is called once, and a Continue() member function +// that can be called zero or more times. A ticket's execution terminates +// when either of these member functions returns a status other than +// StatContinue. When that status is not StatOk, then the whole queue +// is shut down. +// +// By returning StatContinue from its Continue() member function, +// a ticket suspends its execution so that any nested tickets that it +// may have created can be run to completion. It is the reponsibility +// of each ticket class to maintain resumption information in its state +// and manage its own progress. 
Most ticket classes inherit from +// class ComponentsOverElements, which implements an outer loop over all +// components of a derived type, and an inner loop over all elements +// of a descriptor, possibly with multiple phases of execution per element. +// +// Tickets are created by WorkQueue::Begin...() member functions. +// There is one of these for each "top level" recursive function in the +// Fortran runtime support library that has been restructured into this +// ticket framework. +// +// When the work queue is running tickets, it always selects the last ticket +// on the list for execution -- "work stack" might have been a more accurate +// name for this framework. This ticket may, while doing its job, create +// new tickets, and since those are pushed after the active one, the first +// such nested ticket will be the next one executed to completion -- i.e., +// the order of nested WorkQueue::Begin...() calls is respected. +// Note that a ticket's Continue() member function won't be called again +// until all nested tickets have run to completion and it is once again +// the last ticket on the queue. +// +// Example for an assignment to a derived type: +// 1. Assign() is called, and its work queue is created. It calls +// WorkQueue::BeginAssign() and then WorkQueue::Run(). +// 2. Run calls AssignTicket::Begin(), which pushes a tickets via +// BeginFinalize() and returns StatContinue. +// 3. FinalizeTicket::Begin() and FinalizeTicket::Continue() are called +// until one of them returns StatOk, which ends the finalization ticket. +// 4. AssignTicket::Continue() is then called; it creates a DerivedAssignTicket +// and then returns StatOk, which ends the ticket. +// 5. At this point, only one ticket remains. DerivedAssignTicket::Begin() +// and ::Continue() are called until they are done (not StatContinue). +// Along the way, it may create nested AssignTickets for components, +// and suspend itself so that they may each run to completion. 
+ +#ifndef FLANG_RT_RUNTIME_WORK_QUEUE_H_ +#define FLANG_RT_RUNTIME_WORK_QUEUE_H_ + +#include "flang-rt/runtime/connection.h" +#include "flang-rt/runtime/descriptor.h" +#include "flang-rt/runtime/stat.h" +#include "flang-rt/runtime/type-info.h" +#include "flang/Common/api-attrs.h" +#include "flang/Runtime/freestanding-tools.h" +#include + +namespace Fortran::runtime::io { +class IoStatementState; +struct NonTbpDefinedIoTable; +} // namespace Fortran::runtime::io + +namespace Fortran::runtime { +class Terminator; +class WorkQueue; + +// Ticket worker base classes + +template class ImmediateTicketRunner { +public: + RT_API_ATTRS explicit ImmediateTicketRunner(TICKET &ticket) + : ticket_{ticket} {} + RT_API_ATTRS int Run(WorkQueue &workQueue) { + int status{ticket_.Begin(workQueue)}; + while (status == StatContinue) { + status = ticket_.Continue(workQueue); + } + return status; + } + +private: + TICKET &ticket_; +}; + +// Base class for ticket workers that operate elementwise over descriptors +class Elementwise { +public: + RT_API_ATTRS Elementwise( + const Descriptor &instance, const Descriptor *from = nullptr) + : instance_{instance}, from_{from} { + instance_.GetLowerBounds(subscripts_); + if (from_) { + from_->GetLowerBounds(fromSubscripts_); + } + } + RT_API_ATTRS bool IsComplete() const { return elementAt_ >= elements_; } + RT_API_ATTRS void Advance() { + ++elementAt_; + instance_.IncrementSubscripts(subscripts_); + if (from_) { + from_->IncrementSubscripts(fromSubscripts_); + } + } + RT_API_ATTRS void SkipToEnd() { elementAt_ = elements_; } + RT_API_ATTRS void Reset() { + elementAt_ = 0; + instance_.GetLowerBounds(subscripts_); + if (from_) { + from_->GetLowerBounds(fromSubscripts_); + } + } + +protected: + const Descriptor &instance_, *from_{nullptr}; + std::size_t elements_{instance_.Elements()}; + std::size_t elementAt_{0}; + SubscriptValue subscripts_[common::maxRank]; + SubscriptValue fromSubscripts_[common::maxRank]; +}; + +// Base class for ticket 
workers that operate over derived type components. +class Componentwise { +public: + RT_API_ATTRS Componentwise(const typeInfo::DerivedType &); + RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; } + RT_API_ATTRS void Advance() { + ++componentAt_; + GetComponent(); + } + RT_API_ATTRS void SkipToEnd() { + component_ = nullptr; + componentAt_ = components_; + } + RT_API_ATTRS void Reset() { + component_ = nullptr; + componentAt_ = 0; + GetComponent(); + } + RT_API_ATTRS void GetComponent(); + +protected: + const typeInfo::DerivedType &derived_; + std::size_t components_{0}, componentAt_{0}; + const typeInfo::Component *component_{nullptr}; + StaticDescriptor componentDescriptor_; +}; + +// Base class for ticket workers that operate over derived type components +// in an outer loop, and elements in an inner loop. +class ComponentsOverElements : public Componentwise, public Elementwise { +public: + RT_API_ATTRS ComponentsOverElements(const Descriptor &instance, + const typeInfo::DerivedType &derived, const Descriptor *from = nullptr) + : Componentwise{derived}, Elementwise{instance, from} { + if (Elementwise::IsComplete()) { + Componentwise::SkipToEnd(); + } + } + RT_API_ATTRS bool IsComplete() const { return Componentwise::IsComplete(); } + RT_API_ATTRS void Advance() { + SkipToNextElement(); + if (Elementwise::IsComplete()) { + Elementwise::Reset(); + Componentwise::Advance(); + } + } + RT_API_ATTRS void SkipToNextElement() { + phase_ = 0; + Elementwise::Advance(); + } + RT_API_ATTRS void SkipToNextComponent() { + phase_ = 0; + Elementwise::Reset(); + Componentwise::Advance(); + } + RT_API_ATTRS void Reset() { + phase_ = 0; + Elementwise::Reset(); + Componentwise::Reset(); + } + +protected: + int phase_{0}; +}; + +// Base class for ticket workers that operate over elements in an outer loop, +// type components in an inner loop. 
+class ElementsOverComponents : public Elementwise, public Componentwise { +public: + RT_API_ATTRS ElementsOverComponents(const Descriptor &instance, + const typeInfo::DerivedType &derived, const Descriptor *from = nullptr) + : Elementwise{instance, from}, Componentwise{derived} { + if (Componentwise::IsComplete()) { + Elementwise::SkipToEnd(); + } + } + RT_API_ATTRS bool IsComplete() const { return Elementwise::IsComplete(); } + RT_API_ATTRS void Advance() { + SkipToNextComponent(); + if (Componentwise::IsComplete()) { + Componentwise::Reset(); + Elementwise::Advance(); + } + } + RT_API_ATTRS void SkipToNextComponent() { + phase_ = 0; + Componentwise::Advance(); + } + RT_API_ATTRS void SkipToNextElement() { + phase_ = 0; + Componentwise::Reset(); + Elementwise::Advance(); + } + +protected: + int phase_{0}; +}; + +// Ticket worker classes + +// Implements derived type instance initialization +class InitializeTicket : public ImmediateTicketRunner, + private ComponentsOverElements { +public: + RT_API_ATTRS InitializeTicket( + const Descriptor &instance, const typeInfo::DerivedType &derived) + : ImmediateTicketRunner{*this}, + ComponentsOverElements{instance, derived} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); +}; + +// Initializes one derived type instance from the value of another +class InitializeCloneTicket + : public ImmediateTicketRunner, + private ComponentsOverElements { +public: + RT_API_ATTRS InitializeCloneTicket(const Descriptor &clone, + const Descriptor &original, const typeInfo::DerivedType &derived, + bool hasStat, const Descriptor *errMsg) + : ImmediateTicketRunner{*this}, + ComponentsOverElements{original, derived}, clone_{clone}, + hasStat_{hasStat}, errMsg_{errMsg} {} + RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; } + RT_API_ATTRS int Continue(WorkQueue &); + +private: + const Descriptor &clone_; + bool hasStat_{false}; + const Descriptor *errMsg_{nullptr}; + StaticDescriptor 
cloneComponentDescriptor_; +}; + +// Implements derived type instance finalization +class FinalizeTicket : public ImmediateTicketRunner, + private ComponentsOverElements { +public: + RT_API_ATTRS FinalizeTicket( + const Descriptor &instance, const typeInfo::DerivedType &derived) + : ImmediateTicketRunner{*this}, + ComponentsOverElements{instance, derived} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); + +private: + const typeInfo::DerivedType *finalizableParentType_{nullptr}; +}; + +// Implements derived type instance destruction +class DestroyTicket : public ImmediateTicketRunner, + private ComponentsOverElements { +public: + RT_API_ATTRS DestroyTicket(const Descriptor &instance, + const typeInfo::DerivedType &derived, bool finalize) + : ImmediateTicketRunner{*this}, + ComponentsOverElements{instance, derived}, finalize_{finalize} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); + +private: + bool finalize_{false}; +}; + +// Implements general intrinsic assignment +class AssignTicket : public ImmediateTicketRunner { +public: + RT_API_ATTRS AssignTicket(Descriptor &to, const Descriptor &from, int flags, + MemmoveFct memmoveFct, const typeInfo::DerivedType *declaredType) + : ImmediateTicketRunner{*this}, to_{to}, from_{&from}, + flags_{flags}, memmoveFct_{memmoveFct}, declaredType_{declaredType} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); + +private: + RT_API_ATTRS bool IsSimpleMemmove() const { + return !toDerived_ && to_.rank() == from_->rank() && to_.IsContiguous() && + from_->IsContiguous() && to_.ElementBytes() == from_->ElementBytes(); + } + RT_API_ATTRS Descriptor &GetTempDescriptor(); + + Descriptor &to_; + const Descriptor *from_{nullptr}; + int flags_{0}; // enum AssignFlags + MemmoveFct memmoveFct_{nullptr}; + StaticDescriptor tempDescriptor_; + const typeInfo::DerivedType *declaredType_{nullptr}; + const typeInfo::DerivedType 
*toDerived_{nullptr}; + Descriptor *toDeallocate_{nullptr}; + bool persist_{false}; + bool done_{false}; +}; + +// Implements derived type intrinsic assignment. +template +class DerivedAssignTicket + : public ImmediateTicketRunner>, + private std::conditional_t { +public: + using Base = std::conditional_t; + RT_API_ATTRS DerivedAssignTicket(const Descriptor &to, const Descriptor &from, + const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct, + Descriptor *deallocateAfter) + : ImmediateTicketRunner{*this}, + Base{to, derived, &from}, flags_{flags}, memmoveFct_{memmoveFct}, + deallocateAfter_{deallocateAfter} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); + +private: + static constexpr bool isComponentwise_{IS_COMPONENTWISE}; + bool toIsContiguous_{this->instance_.IsContiguous()}; + bool fromIsContiguous_{this->from_->IsContiguous()}; + int flags_{0}; + MemmoveFct memmoveFct_{nullptr}; + Descriptor *deallocateAfter_{nullptr}; + StaticDescriptor fromComponentDescriptor_; +}; + +namespace io::descr { + +template +class DescriptorIoTicket + : public ImmediateTicketRunner>, + private Elementwise { +public: + RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io, + const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table, + bool &anyIoTookPlace) + : ImmediateTicketRunner(*this), + Elementwise{descriptor}, io_{io}, table_{table}, + anyIoTookPlace_{anyIoTookPlace} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); + RT_API_ATTRS bool &anyIoTookPlace() { return anyIoTookPlace_; } + +private: + io::IoStatementState &io_; + const io::NonTbpDefinedIoTable *table_{nullptr}; + bool &anyIoTookPlace_; + common::optional nonTbpSpecial_; + const typeInfo::DerivedType *derived_{nullptr}; + const typeInfo::SpecialBinding *special_{nullptr}; + StaticDescriptor elementDescriptor_; +}; + +template +class DerivedIoTicket : public ImmediateTicketRunner>, + private ElementsOverComponents { 
+public: + RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io, + const Descriptor &descriptor, const typeInfo::DerivedType &derived, + const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) + : ImmediateTicketRunner(*this), + ElementsOverComponents{descriptor, derived}, io_{io}, table_{table}, + anyIoTookPlace_{anyIoTookPlace} {} + RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; } + RT_API_ATTRS int Continue(WorkQueue &); + +private: + io::IoStatementState &io_; + const io::NonTbpDefinedIoTable *table_{nullptr}; + bool &anyIoTookPlace_; +}; + +} // namespace io::descr + +struct NullTicket { + RT_API_ATTRS int Begin(WorkQueue &) const { return StatOk; } + RT_API_ATTRS int Continue(WorkQueue &) const { return StatOk; } +}; + +struct Ticket { + RT_API_ATTRS int Continue(WorkQueue &); + bool begun{false}; + std::variant, + DerivedAssignTicket, + io::descr::DescriptorIoTicket, + io::descr::DescriptorIoTicket, + io::descr::DerivedIoTicket, + io::descr::DerivedIoTicket> + u; +}; + +class WorkQueue { +public: + RT_API_ATTRS explicit WorkQueue(Terminator &terminator) + : terminator_{terminator} { + for (int j{1}; j < numStatic_; ++j) { + static_[j].previous = &static_[j - 1]; + static_[j - 1].next = &static_[j]; + } + } + RT_API_ATTRS ~WorkQueue(); + RT_API_ATTRS Terminator &terminator() { return terminator_; }; + + // APIs for particular tasks. These can return StatOk if the work is + // completed immediately. 
+ RT_API_ATTRS int BeginInitialize( + const Descriptor &descriptor, const typeInfo::DerivedType &derived) { + if (runTicketsImmediately_) { + return InitializeTicket{descriptor, derived}.Run(*this); + } else { + StartTicket().u.emplace(descriptor, derived); + return StatContinue; + } + } + RT_API_ATTRS int BeginInitializeClone(const Descriptor &clone, + const Descriptor &original, const typeInfo::DerivedType &derived, + bool hasStat, const Descriptor *errMsg) { + if (runTicketsImmediately_) { + return InitializeCloneTicket{clone, original, derived, hasStat, errMsg} + .Run(*this); + } else { + StartTicket().u.emplace( + clone, original, derived, hasStat, errMsg); + return StatContinue; + } + } + RT_API_ATTRS int BeginFinalize( + const Descriptor &descriptor, const typeInfo::DerivedType &derived) { + if (runTicketsImmediately_) { + return FinalizeTicket{descriptor, derived}.Run(*this); + } else { + StartTicket().u.emplace(descriptor, derived); + return StatContinue; + } + } + RT_API_ATTRS int BeginDestroy(const Descriptor &descriptor, + const typeInfo::DerivedType &derived, bool finalize) { + if (runTicketsImmediately_) { + return DestroyTicket{descriptor, derived, finalize}.Run(*this); + } else { + StartTicket().u.emplace(descriptor, derived, finalize); + return StatContinue; + } + } + RT_API_ATTRS int BeginAssign(Descriptor &to, const Descriptor &from, + int flags, MemmoveFct memmoveFct, + const typeInfo::DerivedType *declaredType) { + if (runTicketsImmediately_) { + return AssignTicket{to, from, flags, memmoveFct, declaredType}.Run(*this); + } else { + StartTicket().u.emplace( + to, from, flags, memmoveFct, declaredType); + return StatContinue; + } + } + template + RT_API_ATTRS int BeginDerivedAssign(Descriptor &to, const Descriptor &from, + const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct, + Descriptor *deallocateAfter) { + if (runTicketsImmediately_) { + return DerivedAssignTicket{ + to, from, derived, flags, memmoveFct, deallocateAfter} + 
.Run(*this); + } else { + StartTicket().u.emplace>( + to, from, derived, flags, memmoveFct, deallocateAfter); + return StatContinue; + } + } + template + RT_API_ATTRS int BeginDescriptorIo(io::IoStatementState &io, + const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table, + bool &anyIoTookPlace) { + if (runTicketsImmediately_) { + return io::descr::DescriptorIoTicket{ + io, descriptor, table, anyIoTookPlace} + .Run(*this); + } else { + StartTicket().u.emplace>( + io, descriptor, table, anyIoTookPlace); + return StatContinue; + } + } + template + RT_API_ATTRS int BeginDerivedIo(io::IoStatementState &io, + const Descriptor &descriptor, const typeInfo::DerivedType &derived, + const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) { + if (runTicketsImmediately_) { + return io::descr::DerivedIoTicket{ + io, descriptor, derived, table, anyIoTookPlace} + .Run(*this); + } else { + StartTicket().u.emplace>( + io, descriptor, derived, table, anyIoTookPlace); + return StatContinue; + } + } + + RT_API_ATTRS int Run(); + +private: +#if RT_DEVICE_COMPILATION + // Always use the work queue on a GPU device to avoid recursion. + static constexpr bool runTicketsImmediately_{false}; +#else + // Avoid the work queue overhead on the host, unless it needs + // debugging, which is so much easier there. + static constexpr bool runTicketsImmediately_{true}; +#endif + + // Most uses of the work queue won't go very deep. 
+ static constexpr int numStatic_{2}; + + struct TicketList { + bool isStatic{true}; + Ticket ticket; + TicketList *previous{nullptr}, *next{nullptr}; + }; + + RT_API_ATTRS Ticket &StartTicket(); + RT_API_ATTRS void Stop(); + + Terminator &terminator_; + TicketList *first_{nullptr}, *last_{nullptr}, *insertAfter_{nullptr}; + TicketList static_[numStatic_]; + TicketList *firstFree_{static_}; +}; + +} // namespace Fortran::runtime +#endif // FLANG_RT_RUNTIME_WORK_QUEUE_H_ diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt index a3f63b4315644..332c0872e065f 100644 --- a/flang-rt/lib/runtime/CMakeLists.txt +++ b/flang-rt/lib/runtime/CMakeLists.txt @@ -68,6 +68,7 @@ set(supported_sources type-info.cpp unit.cpp utf.cpp + work-queue.cpp ) # List of source not used for GPU offloading. @@ -131,6 +132,7 @@ set(gpu_sources type-code.cpp type-info.cpp utf.cpp + work-queue.cpp complex-powi.cpp reduce.cpp reduction.cpp diff --git a/flang-rt/lib/runtime/allocatable.cpp b/flang-rt/lib/runtime/allocatable.cpp index ef18da6ea0786..f724f0a20884b 100644 --- a/flang-rt/lib/runtime/allocatable.cpp +++ b/flang-rt/lib/runtime/allocatable.cpp @@ -165,6 +165,26 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc, alloc, /*asyncObject=*/nullptr, hasStat, errMsg, sourceFile, sourceLine)}; if (stat == StatOk) { Terminator terminator{sourceFile, sourceLine}; + if (alloc.rank() != source.rank() && source.rank() != 0) { + terminator.Crash("ALLOCATE object has rank %d while SOURCE= has rank %d", + alloc.rank(), source.rank()); + } + if (int rank{source.rank()}; rank > 0) { + SubscriptValue allocExtent[maxRank], sourceExtent[maxRank]; + alloc.GetShape(allocExtent); + source.GetShape(sourceExtent); + for (int j{0}; j < rank; ++j) { + if (allocExtent[j] != sourceExtent[j]) { + if (!hasStat) { + terminator.Crash("ALLOCATE object has extent %jd on dimension %d, " + "but SOURCE= has extent %jd", + static_cast(allocExtent[j]), j + 1, + 
static_cast(sourceExtent[j])); + } + return StatInvalidExtent; + } + } + } DoFromSourceAssign(alloc, source, terminator); } return stat; diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp index bf67b5dc8b645..f936a4192a33c 100644 --- a/flang-rt/lib/runtime/assign.cpp +++ b/flang-rt/lib/runtime/assign.cpp @@ -14,6 +14,7 @@ #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/tools.h" #include "flang-rt/runtime/type-info.h" +#include "flang-rt/runtime/work-queue.h" namespace Fortran::runtime { @@ -62,11 +63,24 @@ static inline RT_API_ATTRS bool MustDeallocateLHS( // Distinct shape? Deallocate int rank{to.rank()}; for (int j{0}; j < rank; ++j) { - if (to.GetDimension(j).Extent() != from.GetDimension(j).Extent()) { + const auto &toDim{to.GetDimension(j)}; + const auto &fromDim{from.GetDimension(j)}; + if (toDim.Extent() != fromDim.Extent()) { + return true; + } + if ((flags & UpdateLHSBounds) && + toDim.LowerBound() != fromDim.LowerBound()) { return true; } } } + // Not reallocating; may have to update bounds + if (flags & UpdateLHSBounds) { + int rank{to.rank()}; + for (int j{0}; j < rank; ++j) { + to.GetDimension(j).SetLowerBound(from.GetDimension(j).LowerBound()); + } + } return false; } @@ -102,11 +116,7 @@ static RT_API_ATTRS int AllocateAssignmentLHS( toDim.SetByteStride(stride); stride *= toDim.Extent(); } - int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))}; - if (result == StatOk && derived && !derived->noInitializationNeeded()) { - result = ReturnError(terminator, Initialize(to, *derived, terminator)); - } - return result; + return ReturnError(terminator, to.Allocate(kNoAsyncObject)); } // least <= 0, most >= 0 @@ -169,24 +179,27 @@ static RT_API_ATTRS bool MayAlias(const Descriptor &x, const Descriptor &y) { } static RT_API_ATTRS void DoScalarDefinedAssignment(const Descriptor &to, - const Descriptor &from, const typeInfo::SpecialBinding &special) { + const Descriptor &from, const typeInfo::DerivedType 
&derived, + const typeInfo::SpecialBinding &special) { bool toIsDesc{special.IsArgDescriptor(0)}; bool fromIsDesc{special.IsArgDescriptor(1)}; + const auto *bindings{ + derived.binding().OffsetElement()}; if (toIsDesc) { if (fromIsDesc) { - auto *p{ - special.GetProc()}; + auto *p{special.GetProc( + bindings)}; p(to, from); } else { - auto *p{special.GetProc()}; + auto *p{special.GetProc(bindings)}; p(to, from.raw().base_addr); } } else { if (fromIsDesc) { - auto *p{special.GetProc()}; + auto *p{special.GetProc(bindings)}; p(to.raw().base_addr, from); } else { - auto *p{special.GetProc()}; + auto *p{special.GetProc(bindings)}; p(to.raw().base_addr, from.raw().base_addr); } } @@ -208,7 +221,7 @@ static RT_API_ATTRS void DoElementalDefinedAssignment(const Descriptor &to, to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) { toElementDesc.set_base_addr(to.Element(toAt)); fromElementDesc.set_base_addr(from.Element(fromAt)); - DoScalarDefinedAssignment(toElementDesc, fromElementDesc, special); + DoScalarDefinedAssignment(toElementDesc, fromElementDesc, derived, special); } } @@ -231,6 +244,8 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to, } } +RT_OFFLOAD_API_GROUP_BEGIN + // Common implementation of assignments, both intrinsic assignments and // those cases of polymorphic user-defined ASSIGNMENT(=) TBPs that could not // be resolved in semantics. Most assignment statements do not need any @@ -244,275 +259,461 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to, // dealing with array constructors. RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from, Terminator &terminator, int flags, MemmoveFct memmoveFct) { - bool mustDeallocateLHS{(flags & DeallocateLHS) || - MustDeallocateLHS(to, from, terminator, flags)}; - DescriptorAddendum *toAddendum{to.Addendum()}; - const typeInfo::DerivedType *toDerived{ - toAddendum ? 
toAddendum->derivedType() : nullptr}; - if (toDerived && (flags & NeedFinalization) && - toDerived->noFinalizationNeeded()) { - flags &= ~NeedFinalization; - } - std::size_t toElementBytes{to.ElementBytes()}; - std::size_t fromElementBytes{from.ElementBytes()}; - // The following lambda definition violates the conding style, - // but cuda-11.8 nvcc hits an internal error with the brace initialization. - auto isSimpleMemmove = [&]() { - return !toDerived && to.rank() == from.rank() && to.IsContiguous() && - from.IsContiguous() && toElementBytes == fromElementBytes; - }; - StaticDescriptor deferredDeallocStatDesc; - Descriptor *deferDeallocation{nullptr}; - if (MayAlias(to, from)) { + WorkQueue workQueue{terminator}; + if (workQueue.BeginAssign(to, from, flags, memmoveFct, nullptr) == + StatContinue) { + workQueue.Run(); + } +} + +RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) { + bool mustDeallocateLHS{(flags_ & DeallocateLHS) || + MustDeallocateLHS(to_, *from_, workQueue.terminator(), flags_)}; + DescriptorAddendum *toAddendum{to_.Addendum()}; + toDerived_ = toAddendum ? toAddendum->derivedType() : nullptr; + if (toDerived_ && (flags_ & NeedFinalization) && + toDerived_->noFinalizationNeeded()) { + flags_ &= ~NeedFinalization; + } + if (MayAlias(to_, *from_)) { if (mustDeallocateLHS) { - deferDeallocation = &deferredDeallocStatDesc.descriptor(); + // Convert the LHS into a temporary, then make it look deallocated. 
+ toDeallocate_ = &tempDescriptor_.descriptor(); + persist_ = true; // tempDescriptor_ state must outlive child tickets std::memcpy( - reinterpret_cast(deferDeallocation), &to, to.SizeInBytes()); - to.set_base_addr(nullptr); - } else if (!isSimpleMemmove()) { + reinterpret_cast(toDeallocate_), &to_, to_.SizeInBytes()); + to_.set_base_addr(nullptr); + if (toDerived_ && (flags_ & NeedFinalization)) { + if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)}; + status != StatOk && status != StatContinue) { + return status; + } + flags_ &= ~NeedFinalization; + } + } else if (!IsSimpleMemmove()) { // Handle LHS/RHS aliasing by copying RHS into a temp, then // recursively assigning from that temp. - auto descBytes{from.SizeInBytes()}; - StaticDescriptor staticDesc; - Descriptor &newFrom{staticDesc.descriptor()}; - std::memcpy(reinterpret_cast(&newFrom), &from, descBytes); + auto descBytes{from_->SizeInBytes()}; + Descriptor &newFrom{tempDescriptor_.descriptor()}; + persist_ = true; // tempDescriptor_ state must outlive child tickets + std::memcpy(reinterpret_cast(&newFrom), from_, descBytes); // Pretend the temporary descriptor is for an ALLOCATABLE // entity, otherwise, the Deallocate() below will not // free the descriptor memory. newFrom.raw().attribute = CFI_attribute_allocatable; - auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))}; - if (stat == StatOk) { - if (HasDynamicComponent(from)) { - // If 'from' has allocatable/automatic component, we cannot - // just make a shallow copy of the descriptor member. - // This will still leave data overlap in 'to' and 'newFrom'. - // For example: - // type t - // character, allocatable :: c(:) - // end type t - // type(t) :: x(3) - // x(2:3) = x(1:2) - // We have to make a deep copy into 'newFrom' in this case. 
- RTNAME(AssignTemporary) - (newFrom, from, terminator.sourceFileName(), terminator.sourceLine()); - } else { - ShallowCopy(newFrom, from, true, from.IsContiguous()); + if (int stat{ReturnError( + workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))}; + stat != StatOk) { + return stat; + } + if (HasDynamicComponent(*from_)) { + // If 'from' has allocatable/automatic component, we cannot + // just make a shallow copy of the descriptor member. + // This will still leave data overlap in 'to' and 'newFrom'. + // For example: + // type t + // character, allocatable :: c(:) + // end type t + // type(t) :: x(3) + // x(2:3) = x(1:2) + // We have to make a deep copy into 'newFrom' in this case. + if (const DescriptorAddendum *addendum{newFrom.Addendum()}) { + if (const auto *derived{addendum->derivedType()}) { + if (!derived->noInitializationNeeded()) { + if (int status{workQueue.BeginInitialize(newFrom, *derived)}; + status != StatOk && status != StatContinue) { + return status; + } + } + } } - Assign(to, newFrom, terminator, - flags & - (NeedFinalization | ComponentCanBeDefinedAssignment | - ExplicitLengthCharacterLHS | CanBeDefinedAssignment)); - newFrom.Deallocate(); + static constexpr int nestedFlags{MaybeReallocate | PolymorphicLHS}; + if (int status{workQueue.BeginAssign( + newFrom, *from_, nestedFlags, memmoveFct_, nullptr)}; + status != StatOk && status != StatContinue) { + return status; + } + } else { + ShallowCopy(newFrom, *from_, true, from_->IsContiguous()); } - return; + from_ = &newFrom; // this is why from_ has to be a pointer + flags_ &= NeedFinalization | ComponentCanBeDefinedAssignment | + ExplicitLengthCharacterLHS | CanBeDefinedAssignment; + toDeallocate_ = &newFrom; } } - if (to.IsAllocatable()) { + if (to_.IsAllocatable()) { if (mustDeallocateLHS) { - if (deferDeallocation) { - if ((flags & NeedFinalization) && toDerived) { - Finalize(*deferDeallocation, *toDerived, &terminator); - flags &= ~NeedFinalization; - } - } else { - to.Destroy((flags & 
NeedFinalization) != 0, /*destroyPointers=*/false, - &terminator); - flags &= ~NeedFinalization; + if (!toDeallocate_ && to_.IsAllocated()) { + toDeallocate_ = &to_; + } + } else if (to_.rank() != from_->rank() && !to_.IsAllocated()) { + workQueue.terminator().Crash("Assign: mismatched ranks (%d != %d) in " + "assignment to unallocated allocatable", + to_.rank(), from_->rank()); + } + } else if (!to_.IsAllocated()) { + workQueue.terminator().Crash( + "Assign: left-hand side variable is neither allocated nor allocatable"); + } + if (toDerived_ && to_.IsAllocated()) { + // Schedule finalization or destruction of the LHS. + if (flags_ & NeedFinalization) { + if (int status{workQueue.BeginFinalize(to_, *toDerived_)}; + status != StatOk && status != StatContinue) { + return status; + } + } else if (!toDerived_->noDestructionNeeded()) { + if (int status{ + workQueue.BeginDestroy(to_, *toDerived_, /*finalize=*/false)}; + status != StatOk && status != StatContinue) { + return status; } - } else if (to.rank() != from.rank() && !to.IsAllocated()) { - terminator.Crash("Assign: mismatched ranks (%d != %d) in assignment to " - "unallocated allocatable", - to.rank(), from.rank()); } - if (!to.IsAllocated()) { - if (AllocateAssignmentLHS(to, from, terminator, flags) != StatOk) { - return; + } + return StatContinue; +} + +RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) { + if (done_) { + // All child tickets are complete; can release this ticket's state. + if (toDeallocate_) { + toDeallocate_->Deallocate(); + } + return StatOk; + } + // All necessary finalization or destruction that was initiated by Begin() + // has been completed. Deallocation may be pending, and if it's for the LHS, + // do it now so that the LHS gets reallocated. 
+ if (toDeallocate_ == &to_) { + toDeallocate_ = nullptr; + to_.Deallocate(); + } + // Allocate the LHS if needed + if (!to_.IsAllocated()) { + if (int stat{ + AllocateAssignmentLHS(to_, *from_, workQueue.terminator(), flags_)}; + stat != StatOk) { + return stat; + } + const auto *addendum{to_.Addendum()}; + toDerived_ = addendum ? addendum->derivedType() : nullptr; + if (toDerived_) { + if (!toDerived_->noInitializationNeeded()) { + if (int status{workQueue.BeginInitialize(to_, *toDerived_)}; + status != StatOk) { + return status; + } } - flags &= ~NeedFinalization; - toElementBytes = to.ElementBytes(); // may have changed - toDerived = toAddendum ? toAddendum->derivedType() : nullptr; } } - if (toDerived && (flags & CanBeDefinedAssignment)) { - // Check for a user-defined assignment type-bound procedure; - // see 10.2.1.4-5. A user-defined assignment TBP defines all of - // the semantics, including allocatable (re)allocation and any - // finalization. - // - // Note that the aliasing and LHS (re)allocation handling above - // needs to run even with CanBeDefinedAssignment flag, when - // the Assign() is invoked recursively for component-per-component - // assignments. - if (to.rank() == 0) { - if (const auto *special{toDerived->FindSpecialBinding( + // Check for a user-defined assignment type-bound procedure; + // see 10.2.1.4-5. + // Note that the aliasing and LHS (re)allocation handling above + // needs to run even with CanBeDefinedAssignment flag, since + // Assign() can be invoked recursively for component-wise assignments. + // The declared type (if known) must be used for generic resolution + // of ASSIGNMENT(=) to a binding, but that binding can be overridden. 
+ if (declaredType_ && (flags_ & CanBeDefinedAssignment)) { + if (to_.rank() == 0) { + if (const auto *special{declaredType_->FindSpecialBinding( typeInfo::SpecialBinding::Which::ScalarAssignment)}) { - return DoScalarDefinedAssignment(to, from, *special); + DoScalarDefinedAssignment(to_, *from_, *toDerived_, *special); + done_ = true; + return StatContinue; } } - if (const auto *special{toDerived->FindSpecialBinding( + if (const auto *special{declaredType_->FindSpecialBinding( typeInfo::SpecialBinding::Which::ElementalAssignment)}) { - return DoElementalDefinedAssignment(to, from, *toDerived, *special); + DoElementalDefinedAssignment(to_, *from_, *toDerived_, *special); + done_ = true; + return StatContinue; } } - SubscriptValue toAt[maxRank]; - to.GetLowerBounds(toAt); - // Scalar expansion of the RHS is implied by using the same empty - // subscript values on each (seemingly) elemental reference into - // "from". - SubscriptValue fromAt[maxRank]; - from.GetLowerBounds(fromAt); - std::size_t toElements{to.Elements()}; - if (from.rank() > 0 && toElements != from.Elements()) { - terminator.Crash("Assign: mismatching element counts in array assignment " - "(to %zd, from %zd)", - toElements, from.Elements()); + // Intrinsic assignment + std::size_t toElements{to_.Elements()}; + if (from_->rank() > 0 && toElements != from_->Elements()) { + workQueue.terminator().Crash("Assign: mismatching element counts in array " + "assignment (to %zd, from %zd)", + toElements, from_->Elements()); } - if (to.type() != from.type()) { - terminator.Crash("Assign: mismatching types (to code %d != from code %d)", - to.type().raw(), from.type().raw()); + if (to_.type() != from_->type()) { + workQueue.terminator().Crash( + "Assign: mismatching types (to code %d != from code %d)", + to_.type().raw(), from_->type().raw()); } - if (toElementBytes > fromElementBytes && !to.type().IsCharacter()) { - terminator.Crash("Assign: mismatching non-character element sizes (to %zd " - "bytes != from %zd 
bytes)", + std::size_t toElementBytes{to_.ElementBytes()}; + std::size_t fromElementBytes{from_->ElementBytes()}; + if (toElementBytes > fromElementBytes && !to_.type().IsCharacter()) { + workQueue.terminator().Crash("Assign: mismatching non-character element " + "sizes (to %zd bytes != from %zd bytes)", toElementBytes, fromElementBytes); } - if (const typeInfo::DerivedType * - updatedToDerived{toAddendum ? toAddendum->derivedType() : nullptr}) { - // Derived type intrinsic assignment, which is componentwise and elementwise - // for all components, including parent components (10.2.1.2-3). - // The target is first finalized if still necessary (7.5.6.3(1)) - if (flags & NeedFinalization) { - Finalize(to, *updatedToDerived, &terminator); - } else if (updatedToDerived && !updatedToDerived->noDestructionNeeded()) { - Destroy(to, /*finalize=*/false, *updatedToDerived, &terminator); - } - // Copy the data components (incl. the parent) first. - const Descriptor &componentDesc{updatedToDerived->component()}; - std::size_t numComponents{componentDesc.Elements()}; - for (std::size_t j{0}; j < toElements; - ++j, to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) { - for (std::size_t k{0}; k < numComponents; ++k) { - const auto &comp{ - *componentDesc.ZeroBasedIndexedElement( - k)}; // TODO: exploit contiguity here - // Use PolymorphicLHS for components so that the right things happen - // when the components are polymorphic; when they're not, they're both - // not, and their declared types will match. 
- int nestedFlags{MaybeReallocate | PolymorphicLHS}; - if (flags & ComponentCanBeDefinedAssignment) { - nestedFlags |= - CanBeDefinedAssignment | ComponentCanBeDefinedAssignment; - } - switch (comp.genre()) { - case typeInfo::Component::Genre::Data: - if (comp.category() == TypeCategory::Derived) { - StaticDescriptor statDesc[2]; - Descriptor &toCompDesc{statDesc[0].descriptor()}; - Descriptor &fromCompDesc{statDesc[1].descriptor()}; - comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt); - comp.CreatePointerDescriptor( - fromCompDesc, from, terminator, fromAt); - Assign(toCompDesc, fromCompDesc, terminator, nestedFlags); - } else { // Component has intrinsic type; simply copy raw bytes - std::size_t componentByteSize{comp.SizeInBytes(to)}; - memmoveFct(to.Element(toAt) + comp.offset(), - from.Element(fromAt) + comp.offset(), - componentByteSize); - } - break; - case typeInfo::Component::Genre::Pointer: { - std::size_t componentByteSize{comp.SizeInBytes(to)}; - memmoveFct(to.Element(toAt) + comp.offset(), - from.Element(fromAt) + comp.offset(), - componentByteSize); - } break; - case typeInfo::Component::Genre::Allocatable: - case typeInfo::Component::Genre::Automatic: { - auto *toDesc{reinterpret_cast( - to.Element(toAt) + comp.offset())}; - const auto *fromDesc{reinterpret_cast( - from.Element(fromAt) + comp.offset())}; - // Allocatable components of the LHS are unconditionally - // deallocated before assignment (F'2018 10.2.1.3(13)(1)), - // unlike a "top-level" assignment to a variable, where - // deallocation is optional. - // - // Be careful not to destroy/reallocate the LHS, if there is - // overlap between LHS and RHS (it seems that partial overlap - // is not possible, though). - // Invoke Assign() recursively to deal with potential aliasing. - if (toDesc->IsAllocatable()) { - if (!fromDesc->IsAllocated()) { - // No aliasing. - // - // If to is not allocated, the Destroy() call is a no-op. 
- // This is just a shortcut, because the recursive Assign() - // below would initiate the destruction for to. - // No finalization is required. - toDesc->Destroy( - /*finalize=*/false, /*destroyPointers=*/false, &terminator); - continue; // F'2018 10.2.1.3(13)(2) - } - } - // Force LHS deallocation with DeallocateLHS flag. - // The actual deallocation may be avoided, if the existing - // location can be reoccupied. - Assign(*toDesc, *fromDesc, terminator, nestedFlags | DeallocateLHS); - } break; - } + if (toDerived_) { + if (toDerived_->noDefinedAssignment()) { // componentwise + if (int status{workQueue.BeginDerivedAssign( + to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)}; + status != StatOk && status != StatContinue) { + return status; } - // Copy procedure pointer components - const Descriptor &procPtrDesc{updatedToDerived->procPtr()}; - std::size_t numProcPtrs{procPtrDesc.Elements()}; - for (std::size_t k{0}; k < numProcPtrs; ++k) { - const auto &procPtr{ - *procPtrDesc.ZeroBasedIndexedElement( - k)}; - memmoveFct(to.Element(toAt) + procPtr.offset, - from.Element(fromAt) + procPtr.offset, - sizeof(typeInfo::ProcedurePointer)); + } else { // elementwise + if (int status{workQueue.BeginDerivedAssign( + to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)}; + status != StatOk && status != StatContinue) { + return status; } } - } else { // intrinsic type, intrinsic assignment - if (isSimpleMemmove()) { - memmoveFct(to.raw().base_addr, from.raw().base_addr, - toElements * toElementBytes); - } else if (toElementBytes > fromElementBytes) { // blank padding - switch (to.type().raw()) { + toDeallocate_ = nullptr; + } else if (IsSimpleMemmove()) { + memmoveFct_(to_.raw().base_addr, from_->raw().base_addr, + toElements * toElementBytes); + } else { + // Scalar expansion of the RHS is implied by using the same empty + // subscript values on each (seemingly) elemental reference into + // "from". 
+ SubscriptValue toAt[maxRank]; + to_.GetLowerBounds(toAt); + SubscriptValue fromAt[maxRank]; + from_->GetLowerBounds(fromAt); + if (toElementBytes > fromElementBytes) { // blank padding + switch (to_.type().raw()) { case CFI_type_signed_char: case CFI_type_char: - BlankPadCharacterAssignment(to, from, toAt, fromAt, toElements, + BlankPadCharacterAssignment(to_, *from_, toAt, fromAt, toElements, toElementBytes, fromElementBytes); break; case CFI_type_char16_t: - BlankPadCharacterAssignment(to, from, toAt, fromAt, + BlankPadCharacterAssignment(to_, *from_, toAt, fromAt, toElements, toElementBytes, fromElementBytes); break; case CFI_type_char32_t: - BlankPadCharacterAssignment(to, from, toAt, fromAt, + BlankPadCharacterAssignment(to_, *from_, toAt, fromAt, toElements, toElementBytes, fromElementBytes); break; default: - terminator.Crash("unexpected type code %d in blank padded Assign()", - to.type().raw()); + workQueue.terminator().Crash( + "unexpected type code %d in blank padded Assign()", + to_.type().raw()); } } else { // elemental copies, possibly with character truncation for (std::size_t n{toElements}; n-- > 0; - to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) { - memmoveFct(to.Element(toAt), from.Element(fromAt), + to_.IncrementSubscripts(toAt), from_->IncrementSubscripts(fromAt)) { + memmoveFct_(to_.Element(toAt), from_->Element(fromAt), toElementBytes); } } } - if (deferDeallocation) { - // deferDeallocation is used only when LHS is an allocatable. - // The finalization has already been run for it. 
- deferDeallocation->Destroy( - /*finalize=*/false, /*destroyPointers=*/false, &terminator); + if (persist_) { + done_ = true; + return StatContinue; + } else { + if (toDeallocate_) { + toDeallocate_->Deallocate(); + toDeallocate_ = nullptr; + } + return StatOk; } } -RT_OFFLOAD_API_GROUP_BEGIN +template +RT_API_ATTRS int DerivedAssignTicket::Begin( + WorkQueue &workQueue) { + if (toIsContiguous_ && fromIsContiguous_ && + this->derived_.noDestructionNeeded() && + this->derived_.noDefinedAssignment() && + this->instance_.rank() == this->from_->rank()) { + if (std::size_t elementBytes{this->instance_.ElementBytes()}; + elementBytes == this->from_->ElementBytes()) { + // Fastest path. Both LHS and RHS are contiguous, RHS is not a scalar + // to be expanded, the types have the same size, and there are no + // allocatable components or defined ASSIGNMENT(=) at any level. + memmoveFct_(this->instance_.template OffsetElement(), + this->from_->template OffsetElement(), + this->instance_.Elements() * elementBytes); + return StatOk; + } + } + // Use PolymorphicLHS for components so that the right things happen + // when the components are polymorphic; when they're not, they're both + // not, and their declared types will match. 
+ int nestedFlags{MaybeReallocate | PolymorphicLHS}; + if (flags_ & ComponentCanBeDefinedAssignment) { + nestedFlags |= CanBeDefinedAssignment | ComponentCanBeDefinedAssignment; + } + flags_ = nestedFlags; + // Copy procedure pointer components + const Descriptor &procPtrDesc{this->derived_.procPtr()}; + bool noDataComponents{this->IsComplete()}; + if (std::size_t numProcPtrs{procPtrDesc.Elements()}) { + for (std::size_t k{0}; k < numProcPtrs; ++k) { + const auto &procPtr{ + *procPtrDesc.ZeroBasedIndexedElement(k)}; + // Loop only over elements + if (k > 0) { + Elementwise::Reset(); + } + for (; !Elementwise::IsComplete(); Elementwise::Advance()) { + memmoveFct_(this->instance_.template ElementComponent( + this->subscripts_, procPtr.offset), + this->from_->template ElementComponent( + this->fromSubscripts_, procPtr.offset), + sizeof(typeInfo::ProcedurePointer)); + } + } + if (noDataComponents) { + return StatOk; + } + Elementwise::Reset(); + } + if (noDataComponents) { + return StatOk; + } + return StatContinue; +} +template RT_API_ATTRS int DerivedAssignTicket::Begin(WorkQueue &); +template RT_API_ATTRS int DerivedAssignTicket::Begin(WorkQueue &); + +template +RT_API_ATTRS int DerivedAssignTicket::Continue( + WorkQueue &workQueue) { + while (!this->IsComplete()) { + // Copy the data components (incl. the parent) first. 
+ switch (this->component_->genre()) { + case typeInfo::Component::Genre::Data: + if (this->component_->category() == TypeCategory::Derived) { + Descriptor &toCompDesc{this->componentDescriptor_.descriptor()}; + Descriptor &fromCompDesc{this->fromComponentDescriptor_.descriptor()}; + this->component_->CreatePointerDescriptor(toCompDesc, this->instance_, + workQueue.terminator(), this->subscripts_); + this->component_->CreatePointerDescriptor(fromCompDesc, *this->from_, + workQueue.terminator(), this->fromSubscripts_); + const auto *componentDerived{this->component_->derivedType()}; + this->Advance(); + if (int status{workQueue.BeginAssign(toCompDesc, fromCompDesc, flags_, + memmoveFct_, componentDerived)}; + status != StatOk) { + return status; + } + } else { // Component has intrinsic type; simply copy raw bytes + std::size_t componentByteSize{ + this->component_->SizeInBytes(this->instance_)}; + if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) { + std::size_t offset{this->component_->offset()}; + char *to{this->instance_.template OffsetElement(offset)}; + const char *from{ + this->from_->template OffsetElement(offset)}; + std::size_t toElementStride{this->instance_.ElementBytes()}; + std::size_t fromElementStride{ + this->from_->rank() == 0 ? 
0 : this->from_->ElementBytes()}; + if (toElementStride == fromElementStride && + toElementStride == componentByteSize) { + memmoveFct_(to, from, this->elements_ * componentByteSize); + } else { + for (std::size_t n{this->elements_}; n--; + to += toElementStride, from += fromElementStride) { + memmoveFct_(to, from, componentByteSize); + } + } + this->Componentwise::Advance(); + } else { + memmoveFct_( + this->instance_.template Element(this->subscripts_) + + this->component_->offset(), + this->from_->template Element(this->fromSubscripts_) + + this->component_->offset(), + componentByteSize); + this->Advance(); + } + } + break; + case typeInfo::Component::Genre::Pointer: { + std::size_t componentByteSize{ + this->component_->SizeInBytes(this->instance_)}; + if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) { + std::size_t offset{this->component_->offset()}; + char *to{this->instance_.template OffsetElement(offset)}; + const char *from{ + this->from_->template OffsetElement(offset)}; + std::size_t toElementStride{this->instance_.ElementBytes()}; + std::size_t fromElementStride{ + this->from_->rank() == 0 ? 
0 : this->from_->ElementBytes()}; + if (toElementStride == fromElementStride && + toElementStride == componentByteSize) { + memmoveFct_(to, from, this->elements_ * componentByteSize); + } else { + for (std::size_t n{this->elements_}; n--; + to += toElementStride, from += fromElementStride) { + memmoveFct_(to, from, componentByteSize); + } + } + this->Componentwise::Advance(); + } else { + memmoveFct_(this->instance_.template Element(this->subscripts_) + + this->component_->offset(), + this->from_->template Element(this->fromSubscripts_) + + this->component_->offset(), + componentByteSize); + this->Advance(); + } + } break; + case typeInfo::Component::Genre::Allocatable: + case typeInfo::Component::Genre::Automatic: { + auto *toDesc{reinterpret_cast( + this->instance_.template Element(this->subscripts_) + + this->component_->offset())}; + const auto *fromDesc{reinterpret_cast( + this->from_->template Element(this->fromSubscripts_) + + this->component_->offset())}; + const auto *componentDerived{this->component_->derivedType()}; + if (toDesc->IsAllocatable() && !fromDesc->IsAllocated()) { + if (toDesc->IsAllocated()) { + if (this->phase_ == 0) { + this->phase_++; + if (componentDerived && !componentDerived->noDestructionNeeded()) { + if (int status{workQueue.BeginDestroy( + *toDesc, *componentDerived, /*finalize=*/false)}; + status != StatOk) { + return status; + } + } + } + toDesc->Deallocate(); + } + this->Advance(); + } else { + // Allocatable components of the LHS are unconditionally + // deallocated before assignment (F'2018 10.2.1.3(13)(1)), + // unlike a "top-level" assignment to a variable, where + // deallocation is optional. + int nestedFlags{flags_}; + if (!componentDerived || + (componentDerived->noFinalizationNeeded() && + componentDerived->noInitializationNeeded() && + componentDerived->noDestructionNeeded())) { + // The actual deallocation might be avoidable when the existing + // location can be reoccupied. 
+ nestedFlags |= MaybeReallocate | UpdateLHSBounds; + } else { + // Force LHS deallocation with DeallocateLHS flag. + nestedFlags |= DeallocateLHS; + } + this->Advance(); + if (int status{workQueue.BeginAssign(*toDesc, *fromDesc, nestedFlags, + memmoveFct_, componentDerived)}; + status != StatOk) { + return status; + } + } + } break; + } + } + if (deallocateAfter_) { + deallocateAfter_->Deallocate(); + } + return StatOk; +} +template RT_API_ATTRS int DerivedAssignTicket::Continue(WorkQueue &); +template RT_API_ATTRS int DerivedAssignTicket::Continue(WorkQueue &); RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc, const Descriptor &source, Terminator &terminator, MemmoveFct memmoveFct) { @@ -582,7 +783,6 @@ void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from, } } } - Assign(to, from, terminator, MaybeReallocate | PolymorphicLHS); } @@ -599,7 +799,6 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var, void RTDEF(CopyOutAssign)( Descriptor *var, Descriptor &temp, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; - // Copyout from the temporary must not cause any finalizations // for LHS. The variable must be properly initialized already. 
if (var) { diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp index 35037036f63e7..4e36b1e2edfc8 100644 --- a/flang-rt/lib/runtime/derived.cpp +++ b/flang-rt/lib/runtime/derived.cpp @@ -12,6 +12,7 @@ #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/tools.h" #include "flang-rt/runtime/type-info.h" +#include "flang-rt/runtime/work-queue.h" namespace Fortran::runtime { @@ -30,180 +31,192 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank], } RT_API_ATTRS int Initialize(const Descriptor &instance, - const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat, - const Descriptor *errMsg) { - const Descriptor &componentDesc{derived.component()}; - std::size_t elements{instance.Elements()}; - int stat{StatOk}; - // Initialize data components in each element; the per-element iterations - // constitute the inner loops, not the outer ones - std::size_t myComponents{componentDesc.Elements()}; - for (std::size_t k{0}; k < myComponents; ++k) { - const auto &comp{ - *componentDesc.ZeroBasedIndexedElement(k)}; - SubscriptValue at[maxRank]; - instance.GetLowerBounds(at); - if (comp.genre() == typeInfo::Component::Genre::Allocatable || - comp.genre() == typeInfo::Component::Genre::Automatic) { - for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { - Descriptor &allocDesc{ - *instance.ElementComponent(at, comp.offset())}; - comp.EstablishDescriptor(allocDesc, instance, terminator); + const typeInfo::DerivedType &derived, Terminator &terminator, bool, + const Descriptor *) { + WorkQueue workQueue{terminator}; + int status{workQueue.BeginInitialize(instance, derived)}; + return status == StatContinue ? 
workQueue.Run() : status; +} + +RT_API_ATTRS int InitializeTicket::Begin(WorkQueue &) { + // Initialize procedure pointer components in each element + const Descriptor &procPtrDesc{derived_.procPtr()}; + if (std::size_t numProcPtrs{procPtrDesc.Elements()}) { + for (std::size_t k{0}; k < numProcPtrs; ++k) { + const auto &comp{ + *procPtrDesc.ZeroBasedIndexedElement(k)}; + // Loop only over elements + if (k > 0) { + Elementwise::Reset(); + } + for (; !Elementwise::IsComplete(); Elementwise::Advance()) { + auto &pptr{*instance_.ElementComponent( + subscripts_, comp.offset)}; + pptr = comp.procInitialization; + } + } + if (IsComplete()) { + return StatOk; + } + Elementwise::Reset(); + } + return StatContinue; +} + +RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) { + while (!IsComplete()) { + if (component_->genre() == typeInfo::Component::Genre::Allocatable) { + // Establish allocatable descriptors + for (; !Elementwise::IsComplete(); Elementwise::Advance()) { + Descriptor &allocDesc{*instance_.ElementComponent( + subscripts_, component_->offset())}; + component_->EstablishDescriptor( + allocDesc, instance_, workQueue.terminator()); allocDesc.raw().attribute = CFI_attribute_allocatable; - if (comp.genre() == typeInfo::Component::Genre::Automatic) { - stat = ReturnError( - terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat); - if (stat == StatOk) { - if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) { - if (const auto *derived{addendum->derivedType()}) { - if (!derived->noInitializationNeeded()) { - stat = Initialize( - allocDesc, *derived, terminator, hasStat, errMsg); - } - } - } - } - if (stat != StatOk) { - break; - } - } } - } else if (const void *init{comp.initialization()}) { + SkipToNextComponent(); + } else if (const void *init{component_->initialization()}) { // Explicit initialization of data pointers and // non-allocatable non-automatic components - std::size_t bytes{comp.SizeInBytes(instance)}; - for (std::size_t 
j{0}; j++ < elements; instance.IncrementSubscripts(at)) { - char *ptr{instance.ElementComponent(at, comp.offset())}; + std::size_t bytes{component_->SizeInBytes(instance_)}; + for (; !Elementwise::IsComplete(); Elementwise::Advance()) { + char *ptr{instance_.ElementComponent( + subscripts_, component_->offset())}; std::memcpy(ptr, init, bytes); } - } else if (comp.genre() == typeInfo::Component::Genre::Pointer) { + SkipToNextComponent(); + } else if (component_->genre() == typeInfo::Component::Genre::Pointer) { // Data pointers without explicit initialization are established // so that they are valid right-hand side targets of pointer // assignment statements. - for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { - Descriptor &ptrDesc{ - *instance.ElementComponent(at, comp.offset())}; - comp.EstablishDescriptor(ptrDesc, instance, terminator); + for (; !Elementwise::IsComplete(); Elementwise::Advance()) { + Descriptor &ptrDesc{*instance_.ElementComponent( + subscripts_, component_->offset())}; + component_->EstablishDescriptor( + ptrDesc, instance_, workQueue.terminator()); ptrDesc.raw().attribute = CFI_attribute_pointer; } - } else if (comp.genre() == typeInfo::Component::Genre::Data && - comp.derivedType() && !comp.derivedType()->noInitializationNeeded()) { + SkipToNextComponent(); + } else if (component_->genre() == typeInfo::Component::Genre::Data && + component_->derivedType() && + !component_->derivedType()->noInitializationNeeded()) { // Default initialization of non-pointer non-allocatable/automatic - // data component. Handles parent component's elements. Recursive. + // data component. Handles parent component's elements. 
SubscriptValue extents[maxRank]; - GetComponentExtents(extents, comp, instance); - StaticDescriptor staticDescriptor; - Descriptor &compDesc{staticDescriptor.descriptor()}; - const typeInfo::DerivedType &compType{*comp.derivedType()}; - for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { - compDesc.Establish(compType, - instance.ElementComponent(at, comp.offset()), comp.rank(), - extents); - stat = Initialize(compDesc, compType, terminator, hasStat, errMsg); - if (stat != StatOk) { - break; - } + GetComponentExtents(extents, *component_, instance_); + Descriptor &compDesc{componentDescriptor_.descriptor()}; + const typeInfo::DerivedType &compType{*component_->derivedType()}; + compDesc.Establish(compType, + instance_.ElementComponent(subscripts_, component_->offset()), + component_->rank(), extents); + Advance(); + if (int status{workQueue.BeginInitialize(compDesc, compType)}; + status != StatOk) { + return status; } + } else { + SkipToNextComponent(); } } - // Initialize procedure pointer components in each element - const Descriptor &procPtrDesc{derived.procPtr()}; - std::size_t myProcPtrs{procPtrDesc.Elements()}; - for (std::size_t k{0}; k < myProcPtrs; ++k) { - const auto &comp{ - *procPtrDesc.ZeroBasedIndexedElement(k)}; - SubscriptValue at[maxRank]; - instance.GetLowerBounds(at); - for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { - auto &pptr{*instance.ElementComponent( - at, comp.offset)}; - pptr = comp.procInitialization; - } - } - return stat; + return StatOk; } RT_API_ATTRS int InitializeClone(const Descriptor &clone, - const Descriptor &orig, const typeInfo::DerivedType &derived, + const Descriptor &original, const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat, const Descriptor *errMsg) { - const Descriptor &componentDesc{derived.component()}; - std::size_t elements{orig.Elements()}; - int stat{StatOk}; - - // Skip pointers and unallocated variables. 
- if (orig.IsPointer() || !orig.IsAllocated()) { - return stat; + if (original.IsPointer() || !original.IsAllocated()) { + return StatOk; // nothing to do + } else { + WorkQueue workQueue{terminator}; + int status{workQueue.BeginInitializeClone( + clone, original, derived, hasStat, errMsg)}; + return status == StatContinue ? workQueue.Run() : status; } - // Initialize each data component. - std::size_t components{componentDesc.Elements()}; - for (std::size_t i{0}; i < components; ++i) { - const typeInfo::Component &comp{ - *componentDesc.ZeroBasedIndexedElement(i)}; - SubscriptValue at[maxRank]; - orig.GetLowerBounds(at); - // Allocate allocatable components that are also allocated in the original - // object. - if (comp.genre() == typeInfo::Component::Genre::Allocatable) { - // Initialize each element. - for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) { - Descriptor &origDesc{ - *orig.ElementComponent(at, comp.offset())}; - Descriptor &cloneDesc{ - *clone.ElementComponent(at, comp.offset())}; - if (origDesc.IsAllocated()) { +} + +RT_API_ATTRS int InitializeCloneTicket::Continue(WorkQueue &workQueue) { + while (!IsComplete()) { + if (component_->genre() == typeInfo::Component::Genre::Allocatable) { + Descriptor &origDesc{*instance_.ElementComponent( + subscripts_, component_->offset())}; + if (origDesc.IsAllocated()) { + Descriptor &cloneDesc{*clone_.ElementComponent( + subscripts_, component_->offset())}; + if (phase_ == 0) { + ++phase_; cloneDesc.ApplyMold(origDesc, origDesc.rank()); - stat = ReturnError( - terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat); - if (stat == StatOk) { - if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) { - if (const typeInfo::DerivedType * - derived{addendum->derivedType()}) { - if (!derived->noInitializationNeeded()) { - // Perform default initialization for the allocated element. 
- stat = Initialize( - cloneDesc, *derived, terminator, hasStat, errMsg); - } - // Initialize derived type's allocatables. - if (stat == StatOk) { - stat = InitializeClone(cloneDesc, origDesc, *derived, - terminator, hasStat, errMsg); + if (int stat{ReturnError(workQueue.terminator(), + cloneDesc.Allocate(kNoAsyncObject), errMsg_, hasStat_)}; + stat != StatOk) { + return stat; + } + if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) { + if (const typeInfo::DerivedType *derived{addendum->derivedType()}) { + if (!derived->noInitializationNeeded()) { + // Perform default initialization for the allocated element. + if (int status{workQueue.BeginInitialize(cloneDesc, *derived)}; + status != StatOk) { + return status; } } } } } - if (stat != StatOk) { - break; + if (phase_ == 1) { + ++phase_; + if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) { + if (const typeInfo::DerivedType *derived{addendum->derivedType()}) { + // Initialize derived type's allocatables. + if (int status{workQueue.BeginInitializeClone( + cloneDesc, origDesc, *derived, hasStat_, errMsg_)}; + status != StatOk) { + return status; + } + } + } } } - } else if (comp.genre() == typeInfo::Component::Genre::Data && - comp.derivedType()) { - // Handle nested derived types. - const typeInfo::DerivedType &compType{*comp.derivedType()}; - SubscriptValue extents[maxRank]; - GetComponentExtents(extents, comp, orig); - // Data components don't have descriptors, allocate them. - StaticDescriptor origStaticDesc; - StaticDescriptor cloneStaticDesc; - Descriptor &origDesc{origStaticDesc.descriptor()}; - Descriptor &cloneDesc{cloneStaticDesc.descriptor()}; - // Initialize each element. - for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) { + Advance(); + } else if (component_->genre() == typeInfo::Component::Genre::Data) { + if (component_->derivedType()) { + // Handle nested derived types. 
+ const typeInfo::DerivedType &compType{*component_->derivedType()}; + SubscriptValue extents[maxRank]; + GetComponentExtents(extents, *component_, instance_); + Descriptor &origDesc{componentDescriptor_.descriptor()}; + Descriptor &cloneDesc{cloneComponentDescriptor_.descriptor()}; origDesc.Establish(compType, - orig.ElementComponent(at, comp.offset()), comp.rank(), - extents); + instance_.ElementComponent(subscripts_, component_->offset()), + component_->rank(), extents); cloneDesc.Establish(compType, - clone.ElementComponent(at, comp.offset()), comp.rank(), - extents); - stat = InitializeClone( - cloneDesc, origDesc, compType, terminator, hasStat, errMsg); - if (stat != StatOk) { - break; + clone_.ElementComponent(subscripts_, component_->offset()), + component_->rank(), extents); + Advance(); + if (int status{workQueue.BeginInitializeClone( + cloneDesc, origDesc, compType, hasStat_, errMsg_)}; + status != StatOk) { + return status; } + } else { + SkipToNextComponent(); } + } else { + SkipToNextComponent(); + } + } + return StatOk; +} + +// Fortran 2018 subclause 7.5.6.2 +RT_API_ATTRS void Finalize(const Descriptor &descriptor, + const typeInfo::DerivedType &derived, Terminator *terminator) { + if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) { + Terminator stubTerminator{"Finalize() in Fortran runtime", 0}; + WorkQueue workQueue{terminator ? 
*terminator : stubTerminator}; + if (workQueue.BeginFinalize(descriptor, derived) == StatContinue) { + workQueue.Run(); } } - return stat; } static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal( @@ -221,7 +234,7 @@ static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal( } static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, - const typeInfo::DerivedType &derived, Terminator *terminator) { + const typeInfo::DerivedType &derived, Terminator &terminator) { if (const auto *special{FindFinal(derived, descriptor.rank())}) { if (special->which() == typeInfo::SpecialBinding::Which::ElementalFinal) { std::size_t elements{descriptor.Elements()}; @@ -258,9 +271,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, copy = descriptor; copy.set_base_addr(nullptr); copy.raw().attribute = CFI_attribute_allocatable; - Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0}; - RUNTIME_CHECK(terminator ? *terminator : stubTerminator, - copy.Allocate(kNoAsyncObject) == CFI_SUCCESS); + RUNTIME_CHECK(terminator, copy.Allocate(kNoAsyncObject) == CFI_SUCCESS); ShallowCopyDiscontiguousToContiguous(copy, descriptor); argDescriptor = © } @@ -284,87 +295,94 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, } } -// Fortran 2018 subclause 7.5.6.2 -RT_API_ATTRS void Finalize(const Descriptor &descriptor, - const typeInfo::DerivedType &derived, Terminator *terminator) { - if (derived.noFinalizationNeeded() || !descriptor.IsAllocated()) { - return; - } - CallFinalSubroutine(descriptor, derived, terminator); - const auto *parentType{derived.GetParentType()}; - bool recurse{parentType && !parentType->noFinalizationNeeded()}; +RT_API_ATTRS int FinalizeTicket::Begin(WorkQueue &workQueue) { + CallFinalSubroutine(instance_, derived_, workQueue.terminator()); // If there's a finalizable parent component, handle it last, as required // by the Fortran standard (7.5.6.2), and do so recursively with 
the same // descriptor so that the rank is preserved. - const Descriptor &componentDesc{derived.component()}; - std::size_t myComponents{componentDesc.Elements()}; - std::size_t elements{descriptor.Elements()}; - for (auto k{recurse ? std::size_t{1} - /* skip first component, it's the parent */ - : 0}; - k < myComponents; ++k) { - const auto &comp{ - *componentDesc.ZeroBasedIndexedElement(k)}; - SubscriptValue at[maxRank]; - descriptor.GetLowerBounds(at); - if (comp.genre() == typeInfo::Component::Genre::Allocatable && - comp.category() == TypeCategory::Derived) { + finalizableParentType_ = derived_.GetParentType(); + if (finalizableParentType_) { + if (finalizableParentType_->noFinalizationNeeded()) { + finalizableParentType_ = nullptr; + } else { + SkipToNextComponent(); + } + } + return StatContinue; +} + +RT_API_ATTRS int FinalizeTicket::Continue(WorkQueue &workQueue) { + while (!IsComplete()) { + if (component_->genre() == typeInfo::Component::Genre::Allocatable && + component_->category() == TypeCategory::Derived) { // Component may be polymorphic or unlimited polymorphic. Need to use the // dynamic type to check whether finalization is needed. 
- for (std::size_t j{0}; j++ < elements; - descriptor.IncrementSubscripts(at)) { - const Descriptor &compDesc{ - *descriptor.ElementComponent(at, comp.offset())}; - if (compDesc.IsAllocated()) { - if (const DescriptorAddendum * addendum{compDesc.Addendum()}) { - if (const typeInfo::DerivedType * - compDynamicType{addendum->derivedType()}) { - if (!compDynamicType->noFinalizationNeeded()) { - Finalize(compDesc, *compDynamicType, terminator); + const Descriptor &compDesc{*instance_.ElementComponent( + subscripts_, component_->offset())}; + Advance(); + if (compDesc.IsAllocated()) { + if (const DescriptorAddendum *addendum{compDesc.Addendum()}) { + if (const typeInfo::DerivedType *compDynamicType{ + addendum->derivedType()}) { + if (!compDynamicType->noFinalizationNeeded()) { + if (int status{ + workQueue.BeginFinalize(compDesc, *compDynamicType)}; + status != StatOk) { + return status; } } } } } - } else if (comp.genre() == typeInfo::Component::Genre::Allocatable || - comp.genre() == typeInfo::Component::Genre::Automatic) { - if (const typeInfo::DerivedType * compType{comp.derivedType()}) { - if (!compType->noFinalizationNeeded()) { - for (std::size_t j{0}; j++ < elements; - descriptor.IncrementSubscripts(at)) { - const Descriptor &compDesc{ - *descriptor.ElementComponent(at, comp.offset())}; - if (compDesc.IsAllocated()) { - Finalize(compDesc, *compType, terminator); - } + } else if (component_->genre() == typeInfo::Component::Genre::Allocatable || + component_->genre() == typeInfo::Component::Genre::Automatic) { + if (const typeInfo::DerivedType *compType{component_->derivedType()}; + compType && !compType->noFinalizationNeeded()) { + const Descriptor &compDesc{*instance_.ElementComponent( + subscripts_, component_->offset())}; + Advance(); + if (compDesc.IsAllocated()) { + if (int status{workQueue.BeginFinalize(compDesc, *compType)}; + status != StatOk) { + return status; } } + } else { + SkipToNextComponent(); } - } else if (comp.genre() == 
typeInfo::Component::Genre::Data && - comp.derivedType() && !comp.derivedType()->noFinalizationNeeded()) { + } else if (component_->genre() == typeInfo::Component::Genre::Data && + component_->derivedType() && + !component_->derivedType()->noFinalizationNeeded()) { SubscriptValue extents[maxRank]; - GetComponentExtents(extents, comp, descriptor); - StaticDescriptor staticDescriptor; - Descriptor &compDesc{staticDescriptor.descriptor()}; - const typeInfo::DerivedType &compType{*comp.derivedType()}; - for (std::size_t j{0}; j++ < elements; - descriptor.IncrementSubscripts(at)) { - compDesc.Establish(compType, - descriptor.ElementComponent(at, comp.offset()), comp.rank(), - extents); - Finalize(compDesc, compType, terminator); + GetComponentExtents(extents, *component_, instance_); + Descriptor &compDesc{componentDescriptor_.descriptor()}; + const typeInfo::DerivedType &compType{*component_->derivedType()}; + compDesc.Establish(compType, + instance_.ElementComponent(subscripts_, component_->offset()), + component_->rank(), extents); + Advance(); + if (int status{workQueue.BeginFinalize(compDesc, compType)}; + status != StatOk) { + return status; } + } else { + SkipToNextComponent(); } } - if (recurse) { - StaticDescriptor statDesc; - Descriptor &tmpDesc{statDesc.descriptor()}; - tmpDesc = descriptor; + // Last, do the parent component, if any and finalizable. 
+ if (finalizableParentType_) { + Descriptor &tmpDesc{componentDescriptor_.descriptor()}; + tmpDesc = instance_; tmpDesc.raw().attribute = CFI_attribute_pointer; - tmpDesc.Addendum()->set_derivedType(parentType); - tmpDesc.raw().elem_len = parentType->sizeInBytes(); - Finalize(tmpDesc, *parentType, terminator); + tmpDesc.Addendum()->set_derivedType(finalizableParentType_); + tmpDesc.raw().elem_len = finalizableParentType_->sizeInBytes(); + const auto &parentType{*finalizableParentType_}; + finalizableParentType_ = nullptr; + // Don't return StatOk here if the nested Finalize is still running; + // it needs this->componentDescriptor_. + return workQueue.BeginFinalize(tmpDesc, parentType); } + return StatOk; } // The order of finalization follows Fortran 2018 7.5.6.2, with @@ -373,51 +391,71 @@ RT_API_ATTRS void Finalize(const Descriptor &descriptor, // preceding any deallocation. RT_API_ATTRS void Destroy(const Descriptor &descriptor, bool finalize, const typeInfo::DerivedType &derived, Terminator *terminator) { - if (derived.noDestructionNeeded() || !descriptor.IsAllocated()) { - return; + if (descriptor.IsAllocated() && !derived.noDestructionNeeded()) { + Terminator stubTerminator{"Destroy() in Fortran runtime", 0}; + WorkQueue workQueue{terminator ? *terminator : stubTerminator}; + if (workQueue.BeginDestroy(descriptor, derived, finalize) == StatContinue) { + workQueue.Run(); + } } - if (finalize && !derived.noFinalizationNeeded()) { - Finalize(descriptor, derived, terminator); +} + +RT_API_ATTRS int DestroyTicket::Begin(WorkQueue &workQueue) { + if (finalize_ && !derived_.noFinalizationNeeded()) { + if (int status{workQueue.BeginFinalize(instance_, derived_)}; + status != StatOk && status != StatContinue) { + return status; + } } + return StatContinue; +} + +RT_API_ATTRS int DestroyTicket::Continue(WorkQueue &workQueue) { // Deallocate all direct and indirect allocatable and automatic components.
// Contrary to finalization, the order of deallocation does not matter. - const Descriptor &componentDesc{derived.component()}; - std::size_t myComponents{componentDesc.Elements()}; - std::size_t elements{descriptor.Elements()}; - SubscriptValue at[maxRank]; - descriptor.GetLowerBounds(at); - for (std::size_t k{0}; k < myComponents; ++k) { - const auto &comp{ - *componentDesc.ZeroBasedIndexedElement(k)}; - const bool destroyComp{ - comp.derivedType() && !comp.derivedType()->noDestructionNeeded()}; - if (comp.genre() == typeInfo::Component::Genre::Allocatable || - comp.genre() == typeInfo::Component::Genre::Automatic) { - for (std::size_t j{0}; j < elements; ++j) { - Descriptor *d{ - descriptor.ElementComponent(at, comp.offset())}; - if (destroyComp) { - Destroy(*d, /*finalize=*/false, *comp.derivedType(), terminator); + while (!IsComplete()) { + const auto *componentDerived{component_->derivedType()}; + if (component_->genre() == typeInfo::Component::Genre::Allocatable || + component_->genre() == typeInfo::Component::Genre::Automatic) { + Descriptor *d{instance_.ElementComponent( + subscripts_, component_->offset())}; + if (d->IsAllocated()) { + if (phase_ == 0) { + ++phase_; + if (componentDerived && !componentDerived->noDestructionNeeded()) { + if (int status{workQueue.BeginDestroy( + *d, *componentDerived, /*finalize=*/false)}; + status != StatOk) { + return status; + } + } } d->Deallocate(); - descriptor.IncrementSubscripts(at); } - } else if (destroyComp && - comp.genre() == typeInfo::Component::Genre::Data) { - SubscriptValue extents[maxRank]; - GetComponentExtents(extents, comp, descriptor); - StaticDescriptor staticDescriptor; - Descriptor &compDesc{staticDescriptor.descriptor()}; - const typeInfo::DerivedType &compType{*comp.derivedType()}; - for (std::size_t j{0}; j++ < elements; - descriptor.IncrementSubscripts(at)) { + Advance(); + } else if (component_->genre() == typeInfo::Component::Genre::Data) { + if (!componentDerived || 
componentDerived->noDestructionNeeded()) { + SkipToNextComponent(); + } else { + SubscriptValue extents[maxRank]; + GetComponentExtents(extents, *component_, instance_); + Descriptor &compDesc{componentDescriptor_.descriptor()}; + const typeInfo::DerivedType &compType{*componentDerived}; compDesc.Establish(compType, - descriptor.ElementComponent(at, comp.offset()), comp.rank(), - extents); - Destroy(compDesc, /*finalize=*/false, *comp.derivedType(), terminator); + instance_.ElementComponent(subscripts_, component_->offset()), + component_->rank(), extents); + Advance(); + if (int status{workQueue.BeginDestroy( + compDesc, *componentDerived, /*finalize=*/false)}; + status != StatOk) { + return status; + } } + } else { + SkipToNextComponent(); } } + return StatOk; } RT_API_ATTRS bool HasDynamicComponent(const Descriptor &descriptor) { diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp index 3db1455af52fe..e7b99e6fc3a2b 100644 --- a/flang-rt/lib/runtime/descriptor-io.cpp +++ b/flang-rt/lib/runtime/descriptor-io.cpp @@ -7,15 +7,44 @@ //===----------------------------------------------------------------------===// #include "descriptor-io.h" +#include "edit-input.h" +#include "edit-output.h" +#include "unit.h" +#include "flang-rt/runtime/descriptor.h" +#include "flang-rt/runtime/io-stmt.h" +#include "flang-rt/runtime/namelist.h" +#include "flang-rt/runtime/terminator.h" +#include "flang-rt/runtime/type-info.h" +#include "flang-rt/runtime/work-queue.h" +#include "flang/Common/optional.h" #include "flang/Common/restorer.h" +#include "flang/Common/uint128.h" +#include "flang/Runtime/cpp-type.h" #include "flang/Runtime/freestanding-tools.h" +// Implementation of I/O data list item transfers based on descriptors. +// (All I/O items come through here so that the code is exercised for test; +// some scalar I/O data transfer APIs could be changed to bypass their use +// of descriptors in the future for better efficiency.) 
+ namespace Fortran::runtime::io::descr { RT_OFFLOAD_API_GROUP_BEGIN +template +inline RT_API_ATTRS A &ExtractElement(IoStatementState &io, + const Descriptor &descriptor, const SubscriptValue subscripts[]) { + A *p{descriptor.Element(subscripts)}; + if (!p) { + io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base " + "address or subscripts out of range"); + } + return *p; +} + // Defined formatted I/O (maybe) -Fortran::common::optional DefinedFormattedIo(IoStatementState &io, - const Descriptor &descriptor, const typeInfo::DerivedType &derived, +static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( + IoStatementState &io, const Descriptor &descriptor, + const typeInfo::DerivedType &derived, const typeInfo::SpecialBinding &special, const SubscriptValue subscripts[]) { Fortran::common::optional peek{ @@ -65,10 +94,13 @@ Fortran::common::optional DefinedFormattedIo(IoStatementState &io, // I/O subroutine reads counts towards READ(SIZE=). startPos = io.InquirePos(); } + const auto *bindings{ + derived.binding().OffsetElement()}; if (special.IsArgDescriptor(0)) { // "dtv" argument is "class(t)", pass a descriptor auto *p{special.GetProc()}; + const Descriptor &, int &, char *, std::size_t, std::size_t)>( + bindings)}; StaticDescriptor<1, true, 10 /*?*/> elementStatDesc; Descriptor &elementDesc{elementStatDesc.descriptor()}; elementDesc.Establish( @@ -79,7 +111,8 @@ Fortran::common::optional DefinedFormattedIo(IoStatementState &io, } else { // "dtv" argument is "type(t)", pass a raw pointer auto *p{special.GetProc()}; + const Descriptor &, int &, char *, std::size_t, std::size_t)>( + bindings)}; p(descriptor.Element(subscripts), unit, ioType, vListDesc, ioStat, ioMsg, ioTypeLen, sizeof ioMsg); } @@ -104,8 +137,8 @@ Fortran::common::optional DefinedFormattedIo(IoStatementState &io, } // Defined unformatted I/O -bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor, - const typeInfo::DerivedType &derived, +static 
RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &io, + const Descriptor &descriptor, const typeInfo::DerivedType &derived, const typeInfo::SpecialBinding &special) { // Unformatted I/O must have an external unit (or child thereof). IoErrorHandler &handler{io.GetIoErrorHandler()}; @@ -121,10 +154,12 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor, std::size_t numElements{descriptor.Elements()}; SubscriptValue subscripts[maxRank]; descriptor.GetLowerBounds(subscripts); + const auto *bindings{ + derived.binding().OffsetElement()}; if (special.IsArgDescriptor(0)) { // "dtv" argument is "class(t)", pass a descriptor auto *p{special.GetProc()}; + const Descriptor &, int &, int &, char *, std::size_t)>(bindings)}; StaticDescriptor<1, true, 10 /*?*/> elementStatDesc; Descriptor &elementDesc{elementStatDesc.descriptor()}; elementDesc.Establish(derived, nullptr, 0, nullptr, CFI_attribute_pointer); @@ -137,8 +172,9 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor, } } else { // "dtv" argument is "type(t)", pass a raw pointer - auto *p{special.GetProc()}; + auto *p{special + .GetProc( + bindings)}; for (; numElements-- > 0; descriptor.IncrementSubscripts(subscripts)) { p(descriptor.Element(subscripts), unit, ioStat, ioMsg, sizeof ioMsg); @@ -152,5 +188,619 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor, return handler.GetIoStat() == IostatOk; } +// Per-category descriptor-based I/O templates + +// TODO (perhaps as a nontrivial but small starter project): implement +// automatic repetition counts, like "10*3.14159", for list-directed and +// NAMELIST array output. 
+ +template +inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io, + const Descriptor &descriptor, [[maybe_unused]] bool isSigned) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + using IntType = CppTypeFor; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + if (auto edit{io.GetNextDataEdit()}) { + IntType &x{ExtractElement(io, descriptor, subscripts)}; + if constexpr (DIR == Direction::Output) { + if (!EditIntegerOutput(io, *edit, x, isSigned)) { + return false; + } + } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { + if (EditIntegerInput( + io, *edit, reinterpret_cast(&x), KIND, isSigned)) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedIntegerIO: subscripts out of bounds"); + } + } else { + return false; + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedRealIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + using RawType = typename RealOutputEditing::BinaryFloatingPoint; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + if (auto edit{io.GetNextDataEdit()}) { + RawType &x{ExtractElement(io, descriptor, subscripts)}; + if constexpr (DIR == Direction::Output) { + if (!RealOutputEditing{io, x}.Edit(*edit)) { + return false; + } + } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { + if (EditRealInput(io, *edit, reinterpret_cast(&x))) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedRealIO: subscripts out of bounds"); + } + } else { + 
return false; + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedComplexIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + bool isListOutput{ + io.get_if>() != nullptr}; + using RawType = typename RealOutputEditing::BinaryFloatingPoint; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + RawType *x{&ExtractElement(io, descriptor, subscripts)}; + if (isListOutput) { + DataEdit rEdit, iEdit; + rEdit.descriptor = DataEdit::ListDirectedRealPart; + iEdit.descriptor = DataEdit::ListDirectedImaginaryPart; + rEdit.modes = iEdit.modes = io.mutableModes(); + if (!RealOutputEditing{io, x[0]}.Edit(rEdit) || + !RealOutputEditing{io, x[1]}.Edit(iEdit)) { + return false; + } + } else { + for (int k{0}; k < 2; ++k, ++x) { + auto edit{io.GetNextDataEdit()}; + if (!edit) { + return false; + } else if constexpr (DIR == Direction::Output) { + if (!RealOutputEditing{io, *x}.Edit(*edit)) { + return false; + } + } else if (edit->descriptor == DataEdit::ListDirectedNullValue) { + break; + } else if (EditRealInput( + io, *edit, reinterpret_cast(x))) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedComplexIO: subscripts out of bounds"); + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedCharacterIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + std::size_t length{descriptor.ElementBytes() / sizeof(A)}; + auto *listOutput{io.get_if>()}; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + A *x{&ExtractElement(io, descriptor, subscripts)}; + if (listOutput) { + if 
(!ListDirectedCharacterOutput(io, *listOutput, x, length)) { + return false; + } + } else if (auto edit{io.GetNextDataEdit()}) { + if constexpr (DIR == Direction::Output) { + if (!EditCharacterOutput(io, *edit, x, length)) { + return false; + } + } else { // input + if (edit->descriptor != DataEdit::ListDirectedNullValue) { + if (EditCharacterInput(io, *edit, x, length)) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + } + } else { + return false; + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedCharacterIO: subscripts out of bounds"); + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedLogicalIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + auto *listOutput{io.get_if>()}; + using IntType = CppTypeFor; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + IntType &x{ExtractElement(io, descriptor, subscripts)}; + if (listOutput) { + if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) { + return false; + } + } else if (auto edit{io.GetNextDataEdit()}) { + if constexpr (DIR == Direction::Output) { + if (!EditLogicalOutput(io, *edit, x != 0)) { + return false; + } + } else { + if (edit->descriptor != DataEdit::ListDirectedNullValue) { + bool truth{}; + if (EditLogicalInput(io, *edit, truth)) { + x = truth; + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + } + } else { + return false; + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedLogicalIO: subscripts out of bounds"); + } + } + return true; +} + +template +RT_API_ATTRS int DerivedIoTicket::Continue(WorkQueue &workQueue) { + while (!IsComplete()) { + if (component_->genre() == typeInfo::Component::Genre::Data) { + // 
Create a descriptor for the component + Descriptor &compDesc{componentDescriptor_.descriptor()}; + component_->CreatePointerDescriptor( + compDesc, instance_, io_.GetIoErrorHandler(), subscripts_); + Advance(); + if (int status{workQueue.BeginDescriptorIo( + io_, compDesc, table_, anyIoTookPlace_)}; + status != StatOk) { + return status; + } + } else { + // Component is itself a descriptor + char *pointer{ + instance_.Element(subscripts_) + component_->offset()}; + const Descriptor &compDesc{ + *reinterpret_cast(pointer)}; + Advance(); + if (compDesc.IsAllocated()) { + if (int status{workQueue.BeginDescriptorIo( + io_, compDesc, table_, anyIoTookPlace_)}; + status != StatOk) { + return status; + } + } + } + } + return StatOk; +} + +template RT_API_ATTRS int DerivedIoTicket::Continue( + WorkQueue &); +template RT_API_ATTRS int DerivedIoTicket::Continue( + WorkQueue &); + +template +RT_API_ATTRS int DescriptorIoTicket::Begin(WorkQueue &workQueue) { + IoErrorHandler &handler{io_.GetIoErrorHandler()}; + if (handler.InError()) { + return handler.GetIoStat(); + } + if (!io_.get_if>()) { + handler.Crash("DescriptorIO() called for wrong I/O direction"); + return handler.GetIoStat(); + } + if constexpr (DIR == Direction::Input) { + if (!io_.BeginReadingRecord()) { + return StatOk; + } + } + if (!io_.get_if>()) { + // Unformatted I/O + IoErrorHandler &handler{io_.GetIoErrorHandler()}; + const DescriptorAddendum *addendum{instance_.Addendum()}; + if (const typeInfo::DerivedType *type{ + addendum ? addendum->derivedType() : nullptr}) { + // derived type unformatted I/O + if (table_) { + if (const auto *definedIo{table_->Find(*type, + DIR == Direction::Input + ? common::DefinedIo::ReadUnformatted + : common::DefinedIo::WriteUnformatted)}) { + if (definedIo->subroutine) { + typeInfo::SpecialBinding special{DIR == Direction::Input + ? 
typeInfo::SpecialBinding::Which::ReadUnformatted + : typeInfo::SpecialBinding::Which::WriteUnformatted, + definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, + false}; + if (DefinedUnformattedIo(io_, instance_, *type, special)) { + anyIoTookPlace_ = true; + return StatOk; + } + } else { + int status{workQueue.BeginDerivedIo( + io_, instance_, *type, table_, anyIoTookPlace_)}; + return status == StatContinue ? StatOk : status; // done here + } + } + } + if (const typeInfo::SpecialBinding *special{ + type->FindSpecialBinding(DIR == Direction::Input + ? typeInfo::SpecialBinding::Which::ReadUnformatted + : typeInfo::SpecialBinding::Which::WriteUnformatted)}) { + if (!table_ || !table_->ignoreNonTbpEntries || special->IsTypeBound()) { + // defined derived type unformatted I/O + if (DefinedUnformattedIo(io_, instance_, *type, *special)) { + anyIoTookPlace_ = true; + return StatOk; + } else { + return IostatEnd; + } + } + } + // Default derived type unformatted I/O + // TODO: If no component at any level has defined READ or WRITE + // (as appropriate), the elements are contiguous, and no byte swapping + // is active, do a block transfer via the code below. + int status{workQueue.BeginDerivedIo( + io_, instance_, *type, table_, anyIoTookPlace_)}; + return status == StatContinue ? StatOk : status; // done here + } else { + // intrinsic type unformatted I/O + auto *externalUnf{io_.get_if>()}; + ChildUnformattedIoStatementState *childUnf{nullptr}; + InquireIOLengthState *inq{nullptr}; + bool swapEndianness{false}; + if (externalUnf) { + swapEndianness = externalUnf->unit().swapEndianness(); + } else { + childUnf = io_.get_if>(); + if (!childUnf) { + inq = DIR == Direction::Output ? 
io_.get_if() + : nullptr; + RUNTIME_CHECK(handler, inq != nullptr); + } + } + std::size_t elementBytes{instance_.ElementBytes()}; + std::size_t swappingBytes{elementBytes}; + if (auto maybeCatAndKind{instance_.type().GetCategoryAndKind()}) { + // Byte swapping units can be smaller than elements, namely + // for COMPLEX and CHARACTER. + if (maybeCatAndKind->first == TypeCategory::Character) { + // swap each character position independently + swappingBytes = maybeCatAndKind->second; // kind + } else if (maybeCatAndKind->first == TypeCategory::Complex) { + // swap real and imaginary components independently + swappingBytes /= 2; + } + } + using CharType = + std::conditional_t; + auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool { + if constexpr (DIR == Direction::Output) { + return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes) + : childUnf ? childUnf->Emit(&x, totalBytes, swappingBytes) + : inq->Emit(&x, totalBytes, swappingBytes); + } else { + return externalUnf + ? externalUnf->Receive(&x, totalBytes, swappingBytes) + : childUnf->Receive(&x, totalBytes, swappingBytes); + } + }}; + if (!swapEndianness && + instance_.IsContiguous()) { // contiguous unformatted I/O + char &x{ExtractElement(io_, instance_, subscripts_)}; + if (Transfer(x, elements_ * elementBytes)) { + anyIoTookPlace_ = true; + } else { + return IostatEnd; + } + } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O + for (; !IsComplete(); Advance()) { + char &x{ExtractElement(io_, instance_, subscripts_)}; + if (Transfer(x, elementBytes)) { + anyIoTookPlace_ = true; + } else { + return IostatEnd; + } + } + } + } + // Unformatted I/O never needs to call Continue(). 
+ return StatOk; + } + // Formatted I/O + if (auto catAndKind{instance_.type().GetCategoryAndKind()}) { + TypeCategory cat{catAndKind->first}; + int kind{catAndKind->second}; + bool any{false}; + switch (cat) { + case TypeCategory::Integer: + switch (kind) { + case 1: + any = FormattedIntegerIO<1, DIR>(io_, instance_, true); + break; + case 2: + any = FormattedIntegerIO<2, DIR>(io_, instance_, true); + break; + case 4: + any = FormattedIntegerIO<4, DIR>(io_, instance_, true); + break; + case 8: + any = FormattedIntegerIO<8, DIR>(io_, instance_, true); + break; + case 16: + any = FormattedIntegerIO<16, DIR>(io_, instance_, true); + break; + default: + handler.Crash( + "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Unsigned: + switch (kind) { + case 1: + any = FormattedIntegerIO<1, DIR>(io_, instance_, false); + break; + case 2: + any = FormattedIntegerIO<2, DIR>(io_, instance_, false); + break; + case 4: + any = FormattedIntegerIO<4, DIR>(io_, instance_, false); + break; + case 8: + any = FormattedIntegerIO<8, DIR>(io_, instance_, false); + break; + case 16: + any = FormattedIntegerIO<16, DIR>(io_, instance_, false); + break; + default: + handler.Crash( + "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Real: + switch (kind) { + case 2: + any = FormattedRealIO<2, DIR>(io_, instance_); + break; + case 3: + any = FormattedRealIO<3, DIR>(io_, instance_); + break; + case 4: + any = FormattedRealIO<4, DIR>(io_, instance_); + break; + case 8: + any = FormattedRealIO<8, DIR>(io_, instance_); + break; + case 10: + any = FormattedRealIO<10, DIR>(io_, instance_); + break; + // TODO: case double/double + case 16: + any = FormattedRealIO<16, DIR>(io_, instance_); + break; + default: + handler.Crash( + "not yet implemented: REAL(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Complex: + switch (kind) 
{ + case 2: + any = FormattedComplexIO<2, DIR>(io_, instance_); + break; + case 3: + any = FormattedComplexIO<3, DIR>(io_, instance_); + break; + case 4: + any = FormattedComplexIO<4, DIR>(io_, instance_); + break; + case 8: + any = FormattedComplexIO<8, DIR>(io_, instance_); + break; + case 10: + any = FormattedComplexIO<10, DIR>(io_, instance_); + break; + // TODO: case double/double + case 16: + any = FormattedComplexIO<16, DIR>(io_, instance_); + break; + default: + handler.Crash( + "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Character: + switch (kind) { + case 1: + any = FormattedCharacterIO(io_, instance_); + break; + case 2: + any = FormattedCharacterIO(io_, instance_); + break; + case 4: + any = FormattedCharacterIO(io_, instance_); + break; + default: + handler.Crash( + "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Logical: + switch (kind) { + case 1: + any = FormattedLogicalIO<1, DIR>(io_, instance_); + break; + case 2: + any = FormattedLogicalIO<2, DIR>(io_, instance_); + break; + case 4: + any = FormattedLogicalIO<4, DIR>(io_, instance_); + break; + case 8: + any = FormattedLogicalIO<8, DIR>(io_, instance_); + break; + default: + handler.Crash( + "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Derived: { + // Derived type information must be present for formatted I/O. + IoErrorHandler &handler{io_.GetIoErrorHandler()}; + const DescriptorAddendum *addendum{instance_.Addendum()}; + RUNTIME_CHECK(handler, addendum != nullptr); + derived_ = addendum->derivedType(); + RUNTIME_CHECK(handler, derived_ != nullptr); + if (table_) { + if (const auto *definedIo{table_->Find(*derived_, + DIR == Direction::Input ? 
common::DefinedIo::ReadFormatted + : common::DefinedIo::WriteFormatted)}) { + if (definedIo->subroutine) { + nonTbpSpecial_.emplace(DIR == Direction::Input + ? typeInfo::SpecialBinding::Which::ReadFormatted + : typeInfo::SpecialBinding::Which::WriteFormatted, + definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, + false); + special_ = &*nonTbpSpecial_; + } + } + } + if (!special_) { + if (const typeInfo::SpecialBinding *binding{ + derived_->FindSpecialBinding(DIR == Direction::Input + ? typeInfo::SpecialBinding::Which::ReadFormatted + : typeInfo::SpecialBinding::Which::WriteFormatted)}) { + if (!table_ || !table_->ignoreNonTbpEntries || + binding->IsTypeBound()) { + special_ = binding; + } + } + } + return StatContinue; + } + } + if (any) { + anyIoTookPlace_ = true; + } else { + return IostatEnd; + } + } else { + handler.Crash("DescriptorIO: bad type code (%d) in descriptor", + static_cast(instance_.type().raw())); + return handler.GetIoStat(); + } + return StatOk; +} + +template RT_API_ATTRS int DescriptorIoTicket::Begin( + WorkQueue &); +template RT_API_ATTRS int DescriptorIoTicket::Begin( + WorkQueue &); + +template +RT_API_ATTRS int DescriptorIoTicket::Continue(WorkQueue &workQueue) { + // Only derived type formatted I/O gets here. 
+ while (!IsComplete()) { + if (special_) { + if (auto defined{DefinedFormattedIo( + io_, instance_, *derived_, *special_, subscripts_)}) { + anyIoTookPlace_ |= *defined; + Advance(); + continue; + } + } + Descriptor &elementDesc{elementDescriptor_.descriptor()}; + elementDesc.Establish( + *derived_, nullptr, 0, nullptr, CFI_attribute_pointer); + elementDesc.set_base_addr(instance_.Element(subscripts_)); + Advance(); + if (int status{workQueue.BeginDerivedIo( + io_, elementDesc, *derived_, table_, anyIoTookPlace_)}; + status != StatOk) { + return status; + } + } + return StatOk; +} + +template RT_API_ATTRS int DescriptorIoTicket::Continue( + WorkQueue &); +template RT_API_ATTRS int DescriptorIoTicket::Continue( + WorkQueue &); + +template +RT_API_ATTRS bool DescriptorIO(IoStatementState &io, + const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { + bool anyIoTookPlace{false}; + WorkQueue workQueue{io.GetIoErrorHandler()}; + if (workQueue.BeginDescriptorIo(io, descriptor, table, anyIoTookPlace) == + StatContinue) { + workQueue.Run(); + } + return anyIoTookPlace; +} + +template RT_API_ATTRS bool DescriptorIO( + IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); +template RT_API_ATTRS bool DescriptorIO( + IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); + RT_OFFLOAD_API_GROUP_END } // namespace Fortran::runtime::io::descr diff --git a/flang-rt/lib/runtime/descriptor-io.h b/flang-rt/lib/runtime/descriptor-io.h index eb60f106c9203..88ad59bd24b53 100644 --- a/flang-rt/lib/runtime/descriptor-io.h +++ b/flang-rt/lib/runtime/descriptor-io.h @@ -9,619 +9,27 @@ #ifndef FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_ #define FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_ -// Implementation of I/O data list item transfers based on descriptors. -// (All I/O items come through here so that the code is exercised for test; -// some scalar I/O data transfer APIs could be changed to bypass their use -// of descriptors in the future for better efficiency.) 
+#include "flang-rt/runtime/connection.h" -#include "edit-input.h" -#include "edit-output.h" -#include "unit.h" -#include "flang-rt/runtime/descriptor.h" -#include "flang-rt/runtime/io-stmt.h" -#include "flang-rt/runtime/namelist.h" -#include "flang-rt/runtime/terminator.h" -#include "flang-rt/runtime/type-info.h" -#include "flang/Common/optional.h" -#include "flang/Common/uint128.h" -#include "flang/Runtime/cpp-type.h" +namespace Fortran::runtime { +class Descriptor; +} // namespace Fortran::runtime -namespace Fortran::runtime::io::descr { -template -inline RT_API_ATTRS A &ExtractElement(IoStatementState &io, - const Descriptor &descriptor, const SubscriptValue subscripts[]) { - A *p{descriptor.Element(subscripts)}; - if (!p) { - io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base " - "address or subscripts out of range"); - } - return *p; -} - -// Per-category descriptor-based I/O templates - -// TODO (perhaps as a nontrivial but small starter project): implement -// automatic repetition counts, like "10*3.14159", for list-directed and -// NAMELIST array output. 
- -template -inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io, - const Descriptor &descriptor, [[maybe_unused]] bool isSigned) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - using IntType = CppTypeFor; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - if (auto edit{io.GetNextDataEdit()}) { - IntType &x{ExtractElement(io, descriptor, subscripts)}; - if constexpr (DIR == Direction::Output) { - if (!EditIntegerOutput(io, *edit, x, isSigned)) { - return false; - } - } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { - if (EditIntegerInput( - io, *edit, reinterpret_cast(&x), KIND, isSigned)) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedIntegerIO: subscripts out of bounds"); - } - } else { - return false; - } - } - return true; -} - -template -inline RT_API_ATTRS bool FormattedRealIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - using RawType = typename RealOutputEditing::BinaryFloatingPoint; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - if (auto edit{io.GetNextDataEdit()}) { - RawType &x{ExtractElement(io, descriptor, subscripts)}; - if constexpr (DIR == Direction::Output) { - if (!RealOutputEditing{io, x}.Edit(*edit)) { - return false; - } - } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { - if (EditRealInput(io, *edit, reinterpret_cast(&x))) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedRealIO: subscripts out of bounds"); - } - } else { - 
return false; - } - } - return true; -} +namespace Fortran::runtime::io { +class IoStatementState; +struct NonTbpDefinedIoTable; +} // namespace Fortran::runtime::io -template -inline RT_API_ATTRS bool FormattedComplexIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - bool isListOutput{ - io.get_if>() != nullptr}; - using RawType = typename RealOutputEditing::BinaryFloatingPoint; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - RawType *x{&ExtractElement(io, descriptor, subscripts)}; - if (isListOutput) { - DataEdit rEdit, iEdit; - rEdit.descriptor = DataEdit::ListDirectedRealPart; - iEdit.descriptor = DataEdit::ListDirectedImaginaryPart; - rEdit.modes = iEdit.modes = io.mutableModes(); - if (!RealOutputEditing{io, x[0]}.Edit(rEdit) || - !RealOutputEditing{io, x[1]}.Edit(iEdit)) { - return false; - } - } else { - for (int k{0}; k < 2; ++k, ++x) { - auto edit{io.GetNextDataEdit()}; - if (!edit) { - return false; - } else if constexpr (DIR == Direction::Output) { - if (!RealOutputEditing{io, *x}.Edit(*edit)) { - return false; - } - } else if (edit->descriptor == DataEdit::ListDirectedNullValue) { - break; - } else if (EditRealInput( - io, *edit, reinterpret_cast(x))) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedComplexIO: subscripts out of bounds"); - } - } - return true; -} - -template -inline RT_API_ATTRS bool FormattedCharacterIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - std::size_t length{descriptor.ElementBytes() / sizeof(A)}; - auto *listOutput{io.get_if>()}; - bool anyInput{false}; - for 
(std::size_t j{0}; j < numElements; ++j) { - A *x{&ExtractElement(io, descriptor, subscripts)}; - if (listOutput) { - if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) { - return false; - } - } else if (auto edit{io.GetNextDataEdit()}) { - if constexpr (DIR == Direction::Output) { - if (!EditCharacterOutput(io, *edit, x, length)) { - return false; - } - } else { // input - if (edit->descriptor != DataEdit::ListDirectedNullValue) { - if (EditCharacterInput(io, *edit, x, length)) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - } - } else { - return false; - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedCharacterIO: subscripts out of bounds"); - } - } - return true; -} - -template -inline RT_API_ATTRS bool FormattedLogicalIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - auto *listOutput{io.get_if>()}; - using IntType = CppTypeFor; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - IntType &x{ExtractElement(io, descriptor, subscripts)}; - if (listOutput) { - if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) { - return false; - } - } else if (auto edit{io.GetNextDataEdit()}) { - if constexpr (DIR == Direction::Output) { - if (!EditLogicalOutput(io, *edit, x != 0)) { - return false; - } - } else { - if (edit->descriptor != DataEdit::ListDirectedNullValue) { - bool truth{}; - if (EditLogicalInput(io, *edit, truth)) { - x = truth; - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - } - } else { - return false; - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedLogicalIO: subscripts out of bounds"); - } - } - return true; -} +namespace Fortran::runtime::io::descr { template -static 
RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &, +RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable * = nullptr); -// For intrinsic (not defined) derived type I/O, formatted & unformatted -template -static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io, - const typeInfo::Component &component, const Descriptor &origDescriptor, - const SubscriptValue origSubscripts[], Terminator &terminator, - const NonTbpDefinedIoTable *table) { -#if !defined(RT_DEVICE_AVOID_RECURSION) - if (component.genre() == typeInfo::Component::Genre::Data) { - // Create a descriptor for the component - StaticDescriptor statDesc; - Descriptor &desc{statDesc.descriptor()}; - component.CreatePointerDescriptor( - desc, origDescriptor, terminator, origSubscripts); - return DescriptorIO(io, desc, table); - } else { - // Component is itself a descriptor - char *pointer{ - origDescriptor.Element(origSubscripts) + component.offset()}; - const Descriptor &compDesc{*reinterpret_cast(pointer)}; - return compDesc.IsAllocated() && DescriptorIO(io, compDesc, table); - } -#else - terminator.Crash("not yet implemented: component IO"); -#endif -} - -template -static RT_API_ATTRS bool DefaultComponentwiseFormattedIO(IoStatementState &io, - const Descriptor &descriptor, const typeInfo::DerivedType &type, - const NonTbpDefinedIoTable *table, const SubscriptValue subscripts[]) { - IoErrorHandler &handler{io.GetIoErrorHandler()}; - const Descriptor &compArray{type.component()}; - RUNTIME_CHECK(handler, compArray.rank() == 1); - std::size_t numComponents{compArray.Elements()}; - SubscriptValue at[maxRank]; - compArray.GetLowerBounds(at); - for (std::size_t k{0}; k < numComponents; - ++k, compArray.IncrementSubscripts(at)) { - const typeInfo::Component &component{ - *compArray.Element(at)}; - if (!DefaultComponentIO( - io, component, descriptor, subscripts, handler, table)) { - // Return true for NAMELIST input if any component appeared. 
- auto *listInput{ - io.get_if>()}; - return DIR == Direction::Input && k > 0 && listInput && - listInput->inNamelistSequence(); - } - } - return true; -} - -template -static RT_API_ATTRS bool DefaultComponentwiseUnformattedIO(IoStatementState &io, - const Descriptor &descriptor, const typeInfo::DerivedType &type, - const NonTbpDefinedIoTable *table) { - IoErrorHandler &handler{io.GetIoErrorHandler()}; - const Descriptor &compArray{type.component()}; - RUNTIME_CHECK(handler, compArray.rank() == 1); - std::size_t numComponents{compArray.Elements()}; - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - for (std::size_t j{0}; j < numElements; - ++j, descriptor.IncrementSubscripts(subscripts)) { - SubscriptValue at[maxRank]; - compArray.GetLowerBounds(at); - for (std::size_t k{0}; k < numComponents; - ++k, compArray.IncrementSubscripts(at)) { - const typeInfo::Component &component{ - *compArray.Element(at)}; - if (!DefaultComponentIO( - io, component, descriptor, subscripts, handler, table)) { - return false; - } - } - } - return true; -} - -RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( - IoStatementState &, const Descriptor &, const typeInfo::DerivedType &, - const typeInfo::SpecialBinding &, const SubscriptValue[]); - -template -static RT_API_ATTRS bool FormattedDerivedTypeIO(IoStatementState &io, - const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { - IoErrorHandler &handler{io.GetIoErrorHandler()}; - // Derived type information must be present for formatted I/O. - const DescriptorAddendum *addendum{descriptor.Addendum()}; - RUNTIME_CHECK(handler, addendum != nullptr); - const typeInfo::DerivedType *type{addendum->derivedType()}; - RUNTIME_CHECK(handler, type != nullptr); - Fortran::common::optional nonTbpSpecial; - const typeInfo::SpecialBinding *special{nullptr}; - if (table) { - if (const auto *definedIo{table->Find(*type, - DIR == Direction::Input ? 
common::DefinedIo::ReadFormatted - : common::DefinedIo::WriteFormatted)}) { - if (definedIo->subroutine) { - nonTbpSpecial.emplace(DIR == Direction::Input - ? typeInfo::SpecialBinding::Which::ReadFormatted - : typeInfo::SpecialBinding::Which::WriteFormatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false); - special = &*nonTbpSpecial; - } - } - } - if (!special) { - if (const typeInfo::SpecialBinding * - binding{type->FindSpecialBinding(DIR == Direction::Input - ? typeInfo::SpecialBinding::Which::ReadFormatted - : typeInfo::SpecialBinding::Which::WriteFormatted)}) { - if (!table || !table->ignoreNonTbpEntries || binding->isTypeBound()) { - special = binding; - } - } - } - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - std::size_t numElements{descriptor.Elements()}; - for (std::size_t j{0}; j < numElements; - ++j, descriptor.IncrementSubscripts(subscripts)) { - Fortran::common::optional result; - if (special) { - result = DefinedFormattedIo(io, descriptor, *type, *special, subscripts); - } - if (!result) { - result = DefaultComponentwiseFormattedIO( - io, descriptor, *type, table, subscripts); - } - if (!result.value()) { - // Return true for NAMELIST input if we got anything. 
- auto *listInput{ - io.get_if>()}; - return DIR == Direction::Input && j > 0 && listInput && - listInput->inNamelistSequence(); - } - } - return true; -} - -RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &, const Descriptor &, - const typeInfo::DerivedType &, const typeInfo::SpecialBinding &); +extern template RT_API_ATTRS bool DescriptorIO( + IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); +extern template RT_API_ATTRS bool DescriptorIO( + IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); -// Unformatted I/O -template -static RT_API_ATTRS bool UnformattedDescriptorIO(IoStatementState &io, - const Descriptor &descriptor, const NonTbpDefinedIoTable *table = nullptr) { - IoErrorHandler &handler{io.GetIoErrorHandler()}; - const DescriptorAddendum *addendum{descriptor.Addendum()}; - if (const typeInfo::DerivedType * - type{addendum ? addendum->derivedType() : nullptr}) { - // derived type unformatted I/O - if (table) { - if (const auto *definedIo{table->Find(*type, - DIR == Direction::Input ? common::DefinedIo::ReadUnformatted - : common::DefinedIo::WriteUnformatted)}) { - if (definedIo->subroutine) { - typeInfo::SpecialBinding special{DIR == Direction::Input - ? typeInfo::SpecialBinding::Which::ReadUnformatted - : typeInfo::SpecialBinding::Which::WriteUnformatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false}; - if (Fortran::common::optional wasDefined{ - DefinedUnformattedIo(io, descriptor, *type, special)}) { - return *wasDefined; - } - } else { - return DefaultComponentwiseUnformattedIO( - io, descriptor, *type, table); - } - } - } - if (const typeInfo::SpecialBinding * - special{type->FindSpecialBinding(DIR == Direction::Input - ? 
typeInfo::SpecialBinding::Which::ReadUnformatted - : typeInfo::SpecialBinding::Which::WriteUnformatted)}) { - if (!table || !table->ignoreNonTbpEntries || special->isTypeBound()) { - // defined derived type unformatted I/O - return DefinedUnformattedIo(io, descriptor, *type, *special); - } - } - // Default derived type unformatted I/O - // TODO: If no component at any level has defined READ or WRITE - // (as appropriate), the elements are contiguous, and no byte swapping - // is active, do a block transfer via the code below. - return DefaultComponentwiseUnformattedIO(io, descriptor, *type, table); - } else { - // intrinsic type unformatted I/O - auto *externalUnf{io.get_if>()}; - auto *childUnf{io.get_if>()}; - auto *inq{ - DIR == Direction::Output ? io.get_if() : nullptr}; - RUNTIME_CHECK(handler, externalUnf || childUnf || inq); - std::size_t elementBytes{descriptor.ElementBytes()}; - std::size_t numElements{descriptor.Elements()}; - std::size_t swappingBytes{elementBytes}; - if (auto maybeCatAndKind{descriptor.type().GetCategoryAndKind()}) { - // Byte swapping units can be smaller than elements, namely - // for COMPLEX and CHARACTER. - if (maybeCatAndKind->first == TypeCategory::Character) { - // swap each character position independently - swappingBytes = maybeCatAndKind->second; // kind - } else if (maybeCatAndKind->first == TypeCategory::Complex) { - // swap real and imaginary components independently - swappingBytes /= 2; - } - } - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - using CharType = - std::conditional_t; - auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool { - if constexpr (DIR == Direction::Output) { - return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes) - : childUnf ? childUnf->Emit(&x, totalBytes, swappingBytes) - : inq->Emit(&x, totalBytes, swappingBytes); - } else { - return externalUnf ? 
externalUnf->Receive(&x, totalBytes, swappingBytes) - : childUnf->Receive(&x, totalBytes, swappingBytes); - } - }}; - bool swapEndianness{externalUnf && externalUnf->unit().swapEndianness()}; - if (!swapEndianness && - descriptor.IsContiguous()) { // contiguous unformatted I/O - char &x{ExtractElement(io, descriptor, subscripts)}; - return Transfer(x, numElements * elementBytes); - } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O - for (std::size_t j{0}; j < numElements; ++j) { - char &x{ExtractElement(io, descriptor, subscripts)}; - if (!Transfer(x, elementBytes)) { - return false; - } - if (!descriptor.IncrementSubscripts(subscripts) && - j + 1 < numElements) { - handler.Crash("DescriptorIO: subscripts out of bounds"); - } - } - return true; - } - } -} - -template -static RT_API_ATTRS bool DescriptorIO(IoStatementState &io, - const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { - IoErrorHandler &handler{io.GetIoErrorHandler()}; - if (handler.InError()) { - return false; - } - if (!io.get_if>()) { - handler.Crash("DescriptorIO() called for wrong I/O direction"); - return false; - } - if constexpr (DIR == Direction::Input) { - if (!io.BeginReadingRecord()) { - return false; - } - } - if (!io.get_if>()) { - return UnformattedDescriptorIO(io, descriptor, table); - } - if (auto catAndKind{descriptor.type().GetCategoryAndKind()}) { - TypeCategory cat{catAndKind->first}; - int kind{catAndKind->second}; - switch (cat) { - case TypeCategory::Integer: - switch (kind) { - case 1: - return FormattedIntegerIO<1, DIR>(io, descriptor, true); - case 2: - return FormattedIntegerIO<2, DIR>(io, descriptor, true); - case 4: - return FormattedIntegerIO<4, DIR>(io, descriptor, true); - case 8: - return FormattedIntegerIO<8, DIR>(io, descriptor, true); - case 16: - return FormattedIntegerIO<16, DIR>(io, descriptor, true); - default: - handler.Crash( - "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind); - return false; - } - case 
TypeCategory::Unsigned: - switch (kind) { - case 1: - return FormattedIntegerIO<1, DIR>(io, descriptor, false); - case 2: - return FormattedIntegerIO<2, DIR>(io, descriptor, false); - case 4: - return FormattedIntegerIO<4, DIR>(io, descriptor, false); - case 8: - return FormattedIntegerIO<8, DIR>(io, descriptor, false); - case 16: - return FormattedIntegerIO<16, DIR>(io, descriptor, false); - default: - handler.Crash( - "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind); - return false; - } - case TypeCategory::Real: - switch (kind) { - case 2: - return FormattedRealIO<2, DIR>(io, descriptor); - case 3: - return FormattedRealIO<3, DIR>(io, descriptor); - case 4: - return FormattedRealIO<4, DIR>(io, descriptor); - case 8: - return FormattedRealIO<8, DIR>(io, descriptor); - case 10: - return FormattedRealIO<10, DIR>(io, descriptor); - // TODO: case double/double - case 16: - return FormattedRealIO<16, DIR>(io, descriptor); - default: - handler.Crash( - "not yet implemented: REAL(KIND=%d) in formatted IO", kind); - return false; - } - case TypeCategory::Complex: - switch (kind) { - case 2: - return FormattedComplexIO<2, DIR>(io, descriptor); - case 3: - return FormattedComplexIO<3, DIR>(io, descriptor); - case 4: - return FormattedComplexIO<4, DIR>(io, descriptor); - case 8: - return FormattedComplexIO<8, DIR>(io, descriptor); - case 10: - return FormattedComplexIO<10, DIR>(io, descriptor); - // TODO: case double/double - case 16: - return FormattedComplexIO<16, DIR>(io, descriptor); - default: - handler.Crash( - "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind); - return false; - } - case TypeCategory::Character: - switch (kind) { - case 1: - return FormattedCharacterIO(io, descriptor); - case 2: - return FormattedCharacterIO(io, descriptor); - case 4: - return FormattedCharacterIO(io, descriptor); - default: - handler.Crash( - "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind); - return false; - } - case TypeCategory::Logical: 
- switch (kind) { - case 1: - return FormattedLogicalIO<1, DIR>(io, descriptor); - case 2: - return FormattedLogicalIO<2, DIR>(io, descriptor); - case 4: - return FormattedLogicalIO<4, DIR>(io, descriptor); - case 8: - return FormattedLogicalIO<8, DIR>(io, descriptor); - default: - handler.Crash( - "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind); - return false; - } - case TypeCategory::Derived: - return FormattedDerivedTypeIO(io, descriptor, table); - } - } - handler.Crash("DescriptorIO: bad type code (%d) in descriptor", - static_cast(descriptor.type().raw())); - return false; -} } // namespace Fortran::runtime::io::descr #endif // FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_ diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp index 1d5304254ed0e..0f0564403c0e2 100644 --- a/flang-rt/lib/runtime/environment.cpp +++ b/flang-rt/lib/runtime/environment.cpp @@ -143,6 +143,10 @@ void ExecutionEnvironment::Configure(int ac, const char *av[], } } + if (auto *x{std::getenv("FLANG_RT_DEBUG")}) { + internalDebugging = std::strtol(x, nullptr, 10); + } + if (auto *x{std::getenv("ACC_OFFLOAD_STACK_SIZE")}) { char *end; auto n{std::strtoul(x, &end, 10)}; diff --git a/flang-rt/lib/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp index b0cf2180fc6d4..1bef387a9771f 100644 --- a/flang-rt/lib/runtime/namelist.cpp +++ b/flang-rt/lib/runtime/namelist.cpp @@ -10,6 +10,7 @@ #include "descriptor-io.h" #include "flang-rt/runtime/emit-encoded.h" #include "flang-rt/runtime/io-stmt.h" +#include "flang-rt/runtime/type-info.h" #include "flang/Runtime/io-api.h" #include #include diff --git a/flang-rt/lib/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp index b08195cd31e05..24d05f369fcbe 100644 --- a/flang-rt/lib/runtime/tools.cpp +++ b/flang-rt/lib/runtime/tools.cpp @@ -205,7 +205,7 @@ RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from, // Doing the recursion upwards instead of downwards puts the more common // 
cases earlier in the if-chain and has a tangible impact on performance. template struct ShallowCopyRankSpecialize { - static bool execute(const Descriptor &to, const Descriptor &from, + static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from, bool toIsContiguous, bool fromIsContiguous) { if (to.rank() == RANK && from.rank() == RANK) { ShallowCopyInner(to, from, toIsContiguous, fromIsContiguous); @@ -217,7 +217,7 @@ template struct ShallowCopyRankSpecialize { }; template struct ShallowCopyRankSpecialize { - static bool execute(const Descriptor &to, const Descriptor &from, + static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from, bool toIsContiguous, bool fromIsContiguous) { return false; } diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp index 82182696d70c6..d023c3392d559 100644 --- a/flang-rt/lib/runtime/type-info.cpp +++ b/flang-rt/lib/runtime/type-info.cpp @@ -140,11 +140,11 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor, const SubscriptValue *subscripts) const { RUNTIME_CHECK(terminator, genre_ == Genre::Data); EstablishDescriptor(descriptor, container, terminator); + std::size_t offset{offset_}; if (subscripts) { - descriptor.set_base_addr(container.Element(subscripts) + offset_); - } else { - descriptor.set_base_addr(container.OffsetElement() + offset_); + offset += container.SubscriptsToByteOffset(subscripts); } + descriptor.set_base_addr(container.OffsetElement() + offset); descriptor.raw().attribute = CFI_attribute_pointer; } @@ -279,6 +279,10 @@ FILE *Component::Dump(FILE *f) const { } std::fprintf(f, " category %d kind %d rank %d offset 0x%zx\n", category_, kind_, rank_, static_cast(offset_)); + const auto &dtDesc{derivedType_.descriptor()}; + if (dtDesc.raw().base_addr) { + std::fprintf(f, " derivedType_ %p\n", dtDesc.raw().base_addr); + } if (initialization_) { std::fprintf(f, " initialization @ %p:\n", reinterpret_cast(initialization_)); @@ 
-325,7 +329,7 @@ FILE *SpecialBinding::Dump(FILE *f) const { break; } std::fprintf(f, " isArgDescriptorSet: 0x%x\n", isArgDescriptorSet_); - std::fprintf(f, " isTypeBound: 0x%x\n", isTypeBound_); + std::fprintf(f, " isTypeBound: %d\n", isTypeBound_); std::fprintf(f, " isArgContiguousSet: 0x%x\n", isArgContiguousSet_); std::fprintf(f, " proc: %p\n", reinterpret_cast(proc_)); return f; diff --git a/flang-rt/lib/runtime/work-queue.cpp b/flang-rt/lib/runtime/work-queue.cpp new file mode 100644 index 0000000000000..a508ecb637102 --- /dev/null +++ b/flang-rt/lib/runtime/work-queue.cpp @@ -0,0 +1,161 @@ +//===-- lib/runtime/work-queue.cpp ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang-rt/runtime/work-queue.h" +#include "flang-rt/runtime/environment.h" +#include "flang-rt/runtime/memory.h" +#include "flang-rt/runtime/type-info.h" +#include "flang/Common/visit.h" + +namespace Fortran::runtime { + +#if !defined(RT_DEVICE_COMPILATION) +// FLANG_RT_DEBUG code is disabled when false. 
+static constexpr bool enableDebugOutput{false}; +#endif + +RT_OFFLOAD_API_GROUP_BEGIN + +RT_API_ATTRS Componentwise::Componentwise(const typeInfo::DerivedType &derived) + : derived_{derived}, components_{derived_.component().Elements()} { + GetComponent(); +} + +RT_API_ATTRS void Componentwise::GetComponent() { + if (IsComplete()) { + component_ = nullptr; + } else { + const Descriptor &componentDesc{derived_.component()}; + component_ = componentDesc.ZeroBasedIndexedElement( + componentAt_); + } +} + +RT_API_ATTRS int Ticket::Continue(WorkQueue &workQueue) { + if (!begun) { + begun = true; + return common::visit( + [&workQueue]( + auto &specificTicket) { return specificTicket.Begin(workQueue); }, + u); + } else { + return common::visit( + [&workQueue](auto &specificTicket) { + return specificTicket.Continue(workQueue); + }, + u); + } +} + +RT_API_ATTRS WorkQueue::~WorkQueue() { + if (last_) { + if ((last_->next = firstFree_)) { + last_->next->previous = last_; + } + firstFree_ = first_; + first_ = last_ = nullptr; + } + while (firstFree_) { + TicketList *next{firstFree_->next}; + if (!firstFree_->isStatic) { + FreeMemory(firstFree_); + } + firstFree_ = next; + } +} + +RT_API_ATTRS Ticket &WorkQueue::StartTicket() { + if (!firstFree_) { + void *p{AllocateMemoryOrCrash(terminator_, sizeof(TicketList))}; + firstFree_ = new (p) TicketList; + firstFree_->isStatic = false; + } + TicketList *newTicket{firstFree_}; + if ((firstFree_ = newTicket->next)) { + firstFree_->previous = nullptr; + } + TicketList *after{insertAfter_ ? insertAfter_->next : nullptr}; + if ((newTicket->previous = insertAfter_ ? 
insertAfter_ : last_)) { + newTicket->previous->next = newTicket; + } else { + first_ = newTicket; + } + if ((newTicket->next = after)) { + after->previous = newTicket; + } else { + last_ = newTicket; + } + newTicket->ticket.begun = false; +#if !defined(RT_DEVICE_COMPILATION) + if (enableDebugOutput && + (executionEnvironment.internalDebugging & + ExecutionEnvironment::WorkQueue)) { + std::fprintf(stderr, "WQ: new ticket\n"); + } +#endif + return newTicket->ticket; +} + +RT_API_ATTRS int WorkQueue::Run() { + while (last_) { + TicketList *at{last_}; + insertAfter_ = last_; +#if !defined(RT_DEVICE_COMPILATION) + if (enableDebugOutput && + (executionEnvironment.internalDebugging & + ExecutionEnvironment::WorkQueue)) { + std::fprintf(stderr, "WQ: %zd %s\n", at->ticket.u.index(), + at->ticket.begun ? "Continue" : "Begin"); + } +#endif + int stat{at->ticket.Continue(*this)}; +#if !defined(RT_DEVICE_COMPILATION) + if (enableDebugOutput && + (executionEnvironment.internalDebugging & + ExecutionEnvironment::WorkQueue)) { + std::fprintf(stderr, "WQ: ... 
stat %d\n", stat); + } +#endif + insertAfter_ = nullptr; + if (stat == StatOk) { + if (at->previous) { + at->previous->next = at->next; + } else { + first_ = at->next; + } + if (at->next) { + at->next->previous = at->previous; + } else { + last_ = at->previous; + } + if ((at->next = firstFree_)) { + at->next->previous = at; + } + at->previous = nullptr; + firstFree_ = at; + } else if (stat != StatContinue) { + Stop(); + return stat; + } + } + return StatOk; +} + +RT_API_ATTRS void WorkQueue::Stop() { + if (last_) { + if ((last_->next = firstFree_)) { + last_->next->previous = last_; + } + firstFree_ = first_; + first_ = last_ = nullptr; + } +} + +RT_OFFLOAD_API_GROUP_END + +} // namespace Fortran::runtime diff --git a/flang-rt/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp index 3833e48be3dd6..6c148b1de6f82 100644 --- a/flang-rt/unittests/Runtime/ExternalIOTest.cpp +++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp @@ -184,7 +184,7 @@ TEST(ExternalIOTests, TestSequentialFixedUnformatted) { io = IONAME(BeginInquireIoLength)(__FILE__, __LINE__); for (int j{1}; j <= 3; ++j) { ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc)) - << "OutputDescriptor() for InquireIoLength"; + << "OutputDescriptor() for InquireIoLength " << j; } ASSERT_EQ(IONAME(GetIoLength)(io), 3 * recl) << "GetIoLength"; ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 78d871c593e1d..871749934810c 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -858,6 +858,16 @@ print *, [(j,j=1,10)] warning since such values may have become defined by the time the nested expression's value is required. +* Intrinsic assignment of arrays is defined elementally, and intrinsic + assignment of derived type components is defined componentwise. + However, when intrinsic assignment takes place for an array of derived + type, the order of the loop nesting is not defined. 
+ Some compilers will loop over the elements, assigning all of the components + of each element before proceeding to the next element. + This compiler loops over all of the components, and assigns all of + the elements for each component before proceeding to the next component. + A program using defined assignment might be able to detect the difference. + ## De Facto Standard Features * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 90e05ce3d5ca6..27a6ca4ebdb4e 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -2323,9 +2323,13 @@ def fir_DoLoopOp : region_Op<"do_loop", [AttrSizedOperandSegments, }]; } -def fir_IfOp : region_Op<"if", [DeclareOpInterfaceMethods, RecursiveMemoryEffects, - NoRegionArguments]> { +def fir_IfOp + : region_Op< + "if", [DeclareOpInterfaceMethods< + RegionBranchOpInterface, ["getRegionInvocationBounds", + "getEntrySuccessorRegions"]>, + RecursiveMemoryEffects, NoRegionArguments, + WeightedRegionBranchOpInterface]> { let summary = "if-then-else conditional operation"; let description = [{ Used to conditionally execute operations. This operation is the FIR @@ -2342,7 +2346,8 @@ def fir_IfOp : region_Op<"if", [DeclareOpInterfaceMethods:$region_weights); let results = (outs Variadic:$results); let regions = (region @@ -2371,6 +2376,11 @@ def fir_IfOp : region_Op<"if", [DeclareOpInterfaceMethods &results, unsigned resultNum); + + /// Returns the display name string for the region_weights attribute. 
+ static constexpr llvm::StringRef getWeightsAttrAssemblyName() { + return "weights"; + } }]; } diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index c6a5150a85a4c..e3eed6aed8079 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -565,6 +565,7 @@ class ParseTreeDumper { NODE_ENUM(OmpDependenceType, Value) NODE(parser, OmpTaskDependenceType) NODE_ENUM(OmpTaskDependenceType, Value) + NODE(parser, OmpIndirectClause) NODE(parser, OmpIterationOffset) NODE(parser, OmpIteration) NODE(parser, OmpIterationVector) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 67405f88e09f2..61f97b855b0e5 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4300,6 +4300,12 @@ struct OmpHoldsClause { WRAPPER_CLASS_BOILERPLATE(OmpHoldsClause, common::Indirection); }; +// Ref: [5.2: 209] +struct OmpIndirectClause { + WRAPPER_CLASS_BOILERPLATE( + OmpIndirectClause, std::optional); +}; + // Ref: [5.2:72-73], in 4.5-5.1 it's scattered over individual directives // that allow the IF clause. 
// diff --git a/flang/include/flang/Runtime/assign.h b/flang/include/flang/Runtime/assign.h index bc80997a1bec2..7d198bdcc9e89 100644 --- a/flang/include/flang/Runtime/assign.h +++ b/flang/include/flang/Runtime/assign.h @@ -38,7 +38,8 @@ enum AssignFlags { ComponentCanBeDefinedAssignment = 1 << 3, ExplicitLengthCharacterLHS = 1 << 4, PolymorphicLHS = 1 << 5, - DeallocateLHS = 1 << 6 + DeallocateLHS = 1 << 6, + UpdateLHSBounds = 1 << 7, }; #ifdef RT_DEVICE_COMPILATION diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index b13370512e5cc..69375a83dec25 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -182,9 +182,12 @@ const Symbol *HasImpureFinal( const Symbol &, std::optional rank = std::nullopt); // Is this type finalizable or does it contain any polymorphic allocatable // ultimate components? -bool MayRequireFinalization(const DerivedTypeSpec &derived); +bool MayRequireFinalization(const DerivedTypeSpec &); // Does this type have an allocatable direct component? -bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived); +bool HasAllocatableDirectComponent(const DerivedTypeSpec &); +// Does this type have any defined assignment at any level (or any polymorphic +// allocatable)? 
+bool MayHaveDefinedAssignment(const DerivedTypeSpec &); bool IsInBlankCommon(const Symbol &); bool IsAssumedLengthCharacter(const Symbol &); diff --git a/flang/include/flang/Support/OpenMP-features.h b/flang/include/flang/Support/OpenMP-features.h index 1dd7ea560cc96..349cd19c1224f 100644 --- a/flang/include/flang/Support/OpenMP-features.h +++ b/flang/include/flang/Support/OpenMP-features.h @@ -42,6 +42,9 @@ void setOpenMPMacro(int version, FortranPredefinitions &predefinitions) { case 52: predefinitions.emplace_back("_OPENMP", "202111"); break; + case 60: + predefinitions.emplace_back("_OPENMP", "202411"); + break; case 11: default: predefinitions.emplace_back("_OPENMP", "199911"); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index b5c8de8c2ce8b..bc8fc14bcaeb2 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -727,12 +727,15 @@ class TypeInfo { // Is the type inside a box? 
bool isBox() const { return inBox; } + bool isBoxChar() const { return inBoxChar; } + private: void typeScan(mlir::Type type); std::optional charLen; llvm::SmallVector shape; bool inBox = false; + bool inBoxChar = false; }; void TypeInfo::typeScan(mlir::Type ty) { @@ -748,6 +751,9 @@ void TypeInfo::typeScan(mlir::Type ty) { typeScan(cty.getEleTy()); } else if (auto cty = mlir::dyn_cast(ty)) { charLen = cty.getLen(); + } else if (auto cty = mlir::dyn_cast(ty)) { + inBoxChar = true; + typeScan(cty.getEleTy()); } else if (auto hty = mlir::dyn_cast(ty)) { typeScan(hty.getEleTy()); } else if (auto pty = mlir::dyn_cast(ty)) { @@ -791,12 +797,6 @@ createCopyFunc(mlir::Location loc, lower::AbstractConverter &converter, fir::FortranVariableFlagsAttr attrs; if (varAttrs != fir::FortranVariableFlagsEnum::None) attrs = fir::FortranVariableFlagsAttr::get(builder.getContext(), varAttrs); - llvm::SmallVector typeparams; - if (typeInfo.getCharLength().has_value()) { - mlir::Value charLen = builder.createIntegerConstant( - loc, builder.getCharacterLengthType(), *typeInfo.getCharLength()); - typeparams.push_back(charLen); - } mlir::Value shape; if (!typeInfo.isBox() && !typeInfo.getShape().empty()) { llvm::SmallVector extents; @@ -805,11 +805,34 @@ createCopyFunc(mlir::Location loc, lower::AbstractConverter &converter, builder.createIntegerConstant(loc, builder.getIndexType(), extent)); shape = builder.create(loc, extents); } + mlir::Value dst = funcOp.getArgument(0); + mlir::Value src = funcOp.getArgument(1); + llvm::SmallVector typeparams; + if (typeInfo.isBoxChar()) { + // fir.boxchar will be passed here as fir.ref + auto loadDst = builder.create(loc, dst); + auto loadSrc = builder.create(loc, src); + // get the actual fir.ref type + mlir::Type refType = + fir::ReferenceType::get(mlir::cast(eleTy).getEleTy()); + auto unboxedDst = builder.create( + loc, refType, builder.getIndexType(), loadDst); + auto unboxedSrc = builder.create( + loc, refType, builder.getIndexType(), loadSrc); 
+ // Add length to type parameters + typeparams.push_back(unboxedDst.getResult(1)); + dst = unboxedDst.getResult(0); + src = unboxedSrc.getResult(0); + } else if (typeInfo.getCharLength().has_value()) { + mlir::Value charLen = builder.createIntegerConstant( + loc, builder.getCharacterLengthType(), *typeInfo.getCharLength()); + typeparams.push_back(charLen); + } auto declDst = builder.create( - loc, funcOp.getArgument(0), copyFuncName + "_dst", shape, typeparams, + loc, dst, copyFuncName + "_dst", shape, typeparams, /*dummy_scope=*/nullptr, attrs); auto declSrc = builder.create( - loc, funcOp.getArgument(1), copyFuncName + "_src", shape, typeparams, + loc, src, copyFuncName + "_src", shape, typeparams, /*dummy_scope=*/nullptr, attrs); converter.copyVar(loc, declDst.getBase(), declSrc.getBase(), varAttrs); builder.create(loc); @@ -835,10 +858,13 @@ bool ClauseProcessor::processCopyprivate( // CopyPrivate variables must be passed by reference. However, in the case // of assumed shapes/vla the type is not a !fir.ref, but a !fir.box. - // In these cases to retrieve the appropriate !fir.ref> to - // access the data we need we must perform an alloca and then store to it - // and retrieve the data from the new alloca. - if (mlir::isa(symType)) { + // In the case of character types, the passed in type can also be + // !fir.boxchar. In these cases to retrieve the appropriate + // !fir.ref> or !fir.ref> to access the data + // we need we must perform an alloca and then store to it and retrieve the + // data from the new alloca. 
+ if (mlir::isa(symType) || + mlir::isa(symType)) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); auto alloca = builder.create(currentLocation, symType); builder.create(currentLocation, symVal, alloca); diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 4d0f5c3a127e1..c0c57d1832d4e 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -905,8 +905,8 @@ Inclusive make(const parser::OmpClause::Inclusive &inp, Indirect make(const parser::OmpClause::Indirect &inp, semantics::SemanticsContext &semaCtx) { - // inp -> empty - llvm_unreachable("Empty: indirect"); + // inp.v.v -> std::optional + return Indirect{maybeApply(makeExprFn(semaCtx), inp.v.v)}; } Init make(const parser::OmpClause::Init &inp, diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 82673f0948a5b..3e865a1ee7185 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -201,6 +201,8 @@ class HostEvalInfo { /// structures, but it will probably still require some further work to support /// reverse offloading. static llvm::SmallVector hostEvalInfo; +static llvm::SmallVector + sectionsStack; /// Bind symbols to their corresponding entry block arguments. 
/// @@ -2220,8 +2222,12 @@ static mlir::omp::SectionsOp genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, - const ConstructQueue &queue, ConstructQueue::const_iterator item, - const parser::OmpSectionBlocks §ionBlocks) { + const ConstructQueue &queue, + ConstructQueue::const_iterator item) { + assert(!sectionsStack.empty()); + const auto §ionBlocks = + std::get(sectionsStack.back()->t); + sectionsStack.pop_back(); mlir::omp::SectionsOperands clauseOps; llvm::SmallVector reductionSyms; genSectionsClauses(converter, semaCtx, item->clauses, loc, clauseOps, @@ -2668,6 +2674,7 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { + lower::SymMapScope scope(symTable); mlir::omp::TeamsOperands clauseOps; llvm::SmallVector reductionSyms; genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps, @@ -2975,6 +2982,7 @@ static mlir::omp::ParallelOp genStandaloneParallel( lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { + lower::SymMapScope scope(symTable); mlir::omp::ParallelOperands parallelClauseOps; llvm::SmallVector parallelReductionSyms; genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc, @@ -3458,10 +3466,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter, // Lowered in the enclosing genSectionsOp. break; case llvm::omp::Directive::OMPD_sections: - // Called directly from genOMP([...], OpenMPSectionsConstruct) because it - // has a different prototype. 
- // This code path is still taken when iterating through the construct queue - // in genBodyOfOp + genSectionsOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_simd: newOp = @@ -4024,13 +4029,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(clause.id)); TODO(clauseLocation, name + " clause is not implemented yet"); } - - if (std::holds_alternative(clause.u) && - origDirective == llvm::omp::Directive::OMPD_target_teams) - TODO(clauseLocation, "TARGET TEAMS PRIVATE is not implemented yet"); - if (std::holds_alternative(clause.u) && - origDirective == llvm::omp::Directive::OMPD_target_parallel) - TODO(clauseLocation, "TARGET PARALLEL PRIVATE is not implemented yet"); } llvm::omp::Directive directive = @@ -4137,8 +4135,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, std::get(beginSectionsDirective.t), semaCtx); const auto &endSectionsDirective = std::get(sectionsConstruct.t); - const auto §ionBlocks = - std::get(sectionsConstruct.t); clauses.append(makeClauses( std::get(endSectionsDirective.t), semaCtx)); mlir::Location currentLocation = converter.getCurrentLocation(); @@ -4150,22 +4146,10 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, ConstructQueue queue{ buildConstructQueue(converter.getFirOpBuilder().getModule(), semaCtx, eval, source, directive, clauses)}; - ConstructQueue::iterator next = queue.begin(); - // Generate constructs that come first e.g. 
Parallel - while (next != queue.end() && - next->id != llvm::omp::Directive::OMPD_sections) { - genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue, - next); - next = std::next(next); - } - // call genSectionsOp directly (not via genOMPDispatch) so that we can add the - // sectionBlocks argument - assert(next != queue.end()); - assert(next->id == llvm::omp::Directive::OMPD_sections); - genSectionsOp(converter, symTable, semaCtx, eval, currentLocation, queue, - next, sectionBlocks); - assert(std::next(next) == queue.end()); + sectionsStack.push_back(§ionsConstruct); + genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue, + queue.begin()); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 6181e1fad4240..ecfa2939e96a6 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -4418,6 +4418,19 @@ mlir::ParseResult fir::IfOp::parse(mlir::OpAsmParser &parser, parser.resolveOperand(cond, i1Type, result.operands)) return mlir::failure(); + if (mlir::succeeded( + parser.parseOptionalKeyword(getWeightsAttrAssemblyName()))) { + if (parser.parseLParen()) + return mlir::failure(); + mlir::DenseI32ArrayAttr weights; + if (parser.parseCustomAttributeWithFallback(weights, mlir::Type{})) + return mlir::failure(); + if (weights) + result.addAttribute(getRegionWeightsAttrName(result.name), weights); + if (parser.parseRParen()) + return mlir::failure(); + } + if (parser.parseOptionalArrowTypeList(result.types)) return mlir::failure(); @@ -4449,6 +4462,11 @@ llvm::LogicalResult fir::IfOp::verify() { void fir::IfOp::print(mlir::OpAsmPrinter &p) { bool printBlockTerminators = false; p << ' ' << getCondition(); + if (auto weights = getRegionWeightsAttr()) { + p << ' ' << getWeightsAttrAssemblyName() << '('; + p.printStrippedAttrOrType(weights); + p << ')'; + } if 
(!getResults().empty()) { p << " -> (" << getResultTypes() << ')'; printBlockTerminators = true; @@ -4464,7 +4482,8 @@ void fir::IfOp::print(mlir::OpAsmPrinter &p) { p.printRegion(otherReg, /*printEntryBlockArgs=*/false, printBlockTerminators); } - p.printOptionalAttrDict((*this)->getAttrs()); + p.printOptionalAttrDict((*this)->getAttrs(), + /*elideAttrs=*/{getRegionWeightsAttrName()}); } void fir::IfOp::resultToSourceOps(llvm::SmallVectorImpl &results, diff --git a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp index 8a9e9b80134b8..3d35803e6a2d3 100644 --- a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp +++ b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp @@ -212,9 +212,12 @@ class CfgIfConv : public mlir::OpRewritePattern { } rewriter.setInsertionPointToEnd(condBlock); - rewriter.create( + auto branchOp = rewriter.create( loc, ifOp.getCondition(), ifOpBlock, llvm::ArrayRef(), otherwiseBlock, llvm::ArrayRef()); + llvm::ArrayRef weights = ifOp.getWeights(); + if (!weights.empty()) + branchOp.setWeights(weights); rewriter.replaceOp(ifOp, continueBlock->getArguments()); return success(); } diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 9b112a2133918..c55642d969503 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1004,6 +1004,8 @@ TYPE_PARSER( // "IF" >> construct(construct( parenthesized(Parser{}))) || "INBRANCH" >> construct(construct()) || + "INDIRECT" >> construct(construct( + maybe(parenthesized(scalarLogicalExpr)))) || "INIT" >> construct(construct( parenthesized(Parser{}))) || "INCLUSIVE" >> construct(construct( diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp index 2c215f45bf516..08053594c12e4 100644 --- a/flang/lib/Semantics/check-allocate.cpp +++ b/flang/lib/Semantics/check-allocate.cpp @@ -10,6 +10,7 @@ #include "assignment.h" #include 
"definable.h" #include "flang/Evaluate/fold.h" +#include "flang/Evaluate/shape.h" #include "flang/Evaluate/type.h" #include "flang/Parser/parse-tree.h" #include "flang/Parser/tools.h" @@ -33,6 +34,7 @@ struct AllocateCheckerInfo { bool gotMold{false}; bool gotStream{false}; bool gotPinned{false}; + std::optional sourceExprShape; }; class AllocationCheckerHelper { @@ -259,6 +261,9 @@ static std::optional CheckAllocateOptions( CheckCopyabilityInPureScope(messages, *expr, scope); } } + auto maybeShape{evaluate::GetShape(context.foldingContext(), *expr)}; + info.sourceExprShape = + evaluate::AsConstantExtents(context.foldingContext(), maybeShape); } else { // Error already reported on source expression. // Do not continue allocate checks. @@ -581,6 +586,52 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) { .Attach( ultimate_->name(), "Declared here with rank %d"_en_US, rank_); return false; + } else if (allocateInfo_.gotSource && allocateInfo_.sourceExprShape && + allocateInfo_.sourceExprShape->size() == + static_cast(allocateShapeSpecRank_)) { + std::size_t j{0}; + for (const auto &shapeSpec : + std::get>(allocation_.t)) { + if (j >= allocateInfo_.sourceExprShape->size()) { + break; + } + std::optional lbound; + if (const auto &lb{std::get<0>(shapeSpec.t)}) { + lbound.reset(); + const auto &lbExpr{lb->thing.thing.value()}; + if (const auto *expr{GetExpr(context, lbExpr)}) { + auto folded{ + evaluate::Fold(context.foldingContext(), SomeExpr(*expr))}; + lbound = evaluate::ToInt64(folded); + evaluate::SetExpr(lbExpr, std::move(folded)); + } + } else { + lbound = 1; + } + if (lbound) { + const auto &ubExpr{std::get<1>(shapeSpec.t).thing.thing.value()}; + if (const auto *expr{GetExpr(context, ubExpr)}) { + auto folded{ + evaluate::Fold(context.foldingContext(), SomeExpr(*expr))}; + auto ubound{evaluate::ToInt64(folded)}; + evaluate::SetExpr(ubExpr, std::move(folded)); + if (ubound) { + auto extent{*ubound - *lbound + 1}; + if (extent < 0) { + extent = 
0; + } + if (extent != allocateInfo_.sourceExprShape->at(j)) { + context.Say(name_.source, + "Allocation has extent %jd on dimension %d, but SOURCE= has extent %jd"_err_en_US, + static_cast(extent), j + 1, + static_cast( + allocateInfo_.sourceExprShape->at(j))); + } + } + } + } + ++j; + } } } } else { // allocating a scalar object diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 58d28dce7094a..83f4d1edf3c4f 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -1820,15 +1820,24 @@ void OmpStructureChecker::Leave(const parser::OmpDeclareTargetWithClause &x) { const parser::OmpClause *toClause = FindClause(llvm::omp::Clause::OMPC_to); const parser::OmpClause *linkClause = FindClause(llvm::omp::Clause::OMPC_link); + const parser::OmpClause *indirectClause = + FindClause(llvm::omp::Clause::OMPC_indirect); if (!enterClause && !toClause && !linkClause) { context_.Say(x.source, "If the DECLARE TARGET directive has a clause, it must contain at least one ENTER clause or LINK clause"_err_en_US); } + if (indirectClause && !enterClause) { + context_.Say(x.source, + "The INDIRECT clause cannot be used without the ENTER clause with the DECLARE TARGET directive."_err_en_US); + } unsigned version{context_.langOptions().OpenMPVersion}; if (toClause && version >= 52) { context_.Warn(common::UsageWarning::OpenMPUsage, toClause->source, "The usage of TO clause on DECLARE TARGET directive has been deprecated. 
Use ENTER clause instead."_warn_en_US); } + if (indirectClause) { + CheckAllowedClause(llvm::omp::Clause::OMPC_indirect); + } } } diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp index 9f9e9f5840456..82c8536902eb2 100644 --- a/flang/lib/Semantics/mod-file.cpp +++ b/flang/lib/Semantics/mod-file.cpp @@ -109,15 +109,14 @@ bool ModFileWriter::WriteAll() { } void ModFileWriter::WriteAll(const Scope &scope) { - for (const auto &child : scope.children()) { + for (const Scope &child : scope.children()) { WriteOne(child); } } void ModFileWriter::WriteOne(const Scope &scope) { if (scope.kind() == Scope::Kind::Module) { - auto *symbol{scope.symbol()}; - if (!symbol->test(Symbol::Flag::ModFile)) { + if (const auto *symbol{scope.symbol()}) { Write(*symbol); } WriteAll(scope); // write out submodules @@ -134,7 +133,7 @@ static std::string ModFileName(const SourceName &name, // Write the module file for symbol, which must be a module or submodule. void ModFileWriter::Write(const Symbol &symbol) { const auto &module{symbol.get()}; - if (module.moduleFileHash()) { + if (symbol.test(Symbol::Flag::ModFile) || module.moduleFileHash()) { return; // already written } const auto *ancestor{module.ancestor()}; @@ -372,16 +371,19 @@ void ModFileWriter::PutSymbols( CollectSymbols(scope, sorted, uses, modules); // Write module files for dependencies first so that their // hashes are known. - for (auto ref : modules) { + for (const Symbol &mod : modules) { if (hermeticModules) { - hermeticModules->insert(*ref); + hermeticModules->insert(mod); } else { - Write(*ref); - needs_ << ModHeader::need - << CheckSumString( - ref->get().moduleFileHash().value()) - << (ref->owner().IsIntrinsicModules() ? " i " : " n ") - << ref->name().ToString() << '\n'; + Write(mod); + // It's possible that the module's file already existed and + // without its own hash due to being embedded in a hermetic + // module file. 
+ if (auto hash{mod.get().moduleFileHash()}) { + needs_ << ModHeader::need << CheckSumString(*hash) + << (mod.owner().IsIntrinsicModules() ? " i " : " n ") + << mod.name().ToString() << '\n'; + } } } std::string buf; // stuff after CONTAINS in derived type @@ -855,25 +857,25 @@ void CollectSymbols(const Scope &scope, SymbolVector &sorted, auto symbols{scope.GetSymbols()}; std::size_t commonSize{scope.commonBlocks().size()}; sorted.reserve(symbols.size() + commonSize); - for (SymbolRef symbol : symbols) { - const auto *generic{symbol->detailsIf()}; + for (const Symbol &symbol : symbols) { + const auto *generic{symbol.detailsIf()}; if (generic) { uses.insert(uses.end(), generic->uses().begin(), generic->uses().end()); - for (auto ref : generic->uses()) { - modules.insert(GetUsedModule(ref->get())); + for (const Symbol &used : generic->uses()) { + modules.insert(GetUsedModule(used.get())); } - } else if (const auto *use{symbol->detailsIf()}) { + } else if (const auto *use{symbol.detailsIf()}) { modules.insert(GetUsedModule(*use)); } - if (symbol->test(Symbol::Flag::ParentComp)) { - } else if (symbol->has()) { + if (symbol.test(Symbol::Flag::ParentComp)) { + } else if (symbol.has()) { namelist.push_back(symbol); } else if (generic) { if (generic->specific() && - &generic->specific()->owner() == &symbol->owner()) { + &generic->specific()->owner() == &symbol.owner()) { sorted.push_back(*generic->specific()); } else if (generic->derivedType() && - &generic->derivedType()->owner() == &symbol->owner()) { + &generic->derivedType()->owner() == &symbol.owner()) { sorted.push_back(*generic->derivedType()); } generics.push_back(symbol); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index b5f8667fe36f2..282660684e78a 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -384,6 +384,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const 
parser::OpenMPSectionsConstruct &); void Post(const parser::OpenMPSectionsConstruct &) { PopContext(); } + bool Pre(const parser::OpenMPSectionConstruct &); + void Post(const parser::OpenMPSectionConstruct &) { PopContext(); } + bool Pre(const parser::OpenMPCriticalConstruct &critical); void Post(const parser::OpenMPCriticalConstruct &) { PopContext(); } @@ -2003,6 +2006,12 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionsConstruct &x) { return true; } +bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionConstruct &x) { + PushContext(x.source, llvm::omp::Directive::OMPD_section); + GetContext().withinConstruct = true; + return true; +} + bool OmpAttributeVisitor::Pre(const parser::OpenMPCriticalConstruct &x) { const auto &beginCriticalDir{std::get(x.t)}; const auto &endCriticalDir{std::get(x.t)}; @@ -3024,8 +3033,13 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source, const parser::CharBlock target, std::optional sourceContext, std::optional targetContext) { auto dirContextsSame = [](DirContext &lhs, DirContext &rhs) -> bool { - // Sometimes nested constructs share a scope but are different contexts - return (lhs.scope == rhs.scope) && (lhs.directive == rhs.directive); + // Sometimes nested constructs share a scope but are different contexts. + // The directiveSource comparison is for OmpSection. Sections do not have + // their own scopes and two different sections both have the same directive. + // Their source however is different. This string comparison is unfortunate + // but should only happen for GOTOs inside of SECTION. 
+ return (lhs.scope == rhs.scope) && (lhs.directive == rhs.directive) && + (lhs.directiveSource == rhs.directiveSource); }; unsigned version{context_.langOptions().OpenMPVersion}; if (targetContext && diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index e23e91b674a73..f66918e5c140e 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1729,7 +1729,6 @@ bool OmpVisitor::NeedsScope(const parser::OpenMPBlockConstruct &x) { switch (beginDir.v) { case llvm::omp::Directive::OMPD_master: case llvm::omp::Directive::OMPD_ordered: - case llvm::omp::Directive::OMPD_taskgroup: return false; default: return true; diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 26ae81f97895a..51ba21a9e5edf 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -82,17 +82,17 @@ class RuntimeTableBuilder { const SomeExpr &genre, std::int64_t = 0) const; SomeExpr PackageIntValueExpr(const SomeExpr &genre, std::int64_t = 0) const; std::vector DescribeBindings( - const Scope &dtScope, Scope &); + const Scope &dtScope, Scope &, const SymbolVector &bindings); std::map DescribeSpecialGenerics( - const Scope &dtScope, const Scope &thisScope, - const DerivedTypeSpec *) const; + const Scope &dtScope, const Scope &thisScope, const DerivedTypeSpec *, + const SymbolVector &bindings) const; void DescribeSpecialGeneric(const GenericDetails &, std::map &, const Scope &, - const DerivedTypeSpec *) const; + const DerivedTypeSpec *, const SymbolVector &bindings) const; void DescribeSpecialProc(std::map &, const Symbol &specificOrBinding, bool isAssignment, bool isFinal, std::optional, const Scope *, const DerivedTypeSpec *, - bool isTypeBound) const; + const SymbolVector *bindings) const; void IncorporateDefinedIoGenericInterfaces( std::map &, common::DefinedIo, const Scope *, const DerivedTypeSpec *); @@ -595,8 +595,9 @@ 
const Symbol *RuntimeTableBuilder::DescribeType( // Compile the "vtable" of type-bound procedure bindings std::uint32_t specialBitSet{0}; if (!dtSymbol->attrs().test(Attr::ABSTRACT)) { + SymbolVector boundProcedures{CollectBindings(dtScope)}; std::vector bindings{ - DescribeBindings(dtScope, scope)}; + DescribeBindings(dtScope, scope, boundProcedures)}; AddValue(dtValues, derivedTypeSchema_, bindingDescCompName, SaveDerivedPointerTarget(scope, SaveObjectName( @@ -609,12 +610,14 @@ const Symbol *RuntimeTableBuilder::DescribeType( // subroutines override any parent bindings, but FINAL subroutines do not // (the runtime will call all of them). std::map specials{ - DescribeSpecialGenerics(dtScope, dtScope, derivedTypeSpec)}; + DescribeSpecialGenerics( + dtScope, dtScope, derivedTypeSpec, boundProcedures)}; if (derivedTypeSpec) { - for (auto &ref : FinalsForDerivedTypeInstantiation(*derivedTypeSpec)) { - DescribeSpecialProc(specials, *ref, /*isAssignment-*/ false, + for (const Symbol &symbol : + FinalsForDerivedTypeInstantiation(*derivedTypeSpec)) { + DescribeSpecialProc(specials, symbol, /*isAssignment-*/ false, /*isFinal=*/true, std::nullopt, nullptr, derivedTypeSpec, - /*isTypeBound=*/true); + &boundProcedures); } IncorporateDefinedIoGenericInterfaces(specials, common::DefinedIo::ReadFormatted, &scope, derivedTypeSpec); @@ -661,6 +664,10 @@ const Symbol *RuntimeTableBuilder::DescribeType( AddValue(dtValues, derivedTypeSchema_, "nofinalizationneeded"s, IntExpr<1>( derivedTypeSpec && !MayRequireFinalization(*derivedTypeSpec))); + // Similarly, a flag to enable optimized runtime assignment. 
+ AddValue(dtValues, derivedTypeSchema_, "nodefinedassignment"s, + IntExpr<1>( + derivedTypeSpec && !MayHaveDefinedAssignment(*derivedTypeSpec))); } dtObject.get().set_init(MaybeExpr{ StructureExpr(Structure(derivedTypeSchema_, std::move(dtValues)))}); @@ -1041,15 +1048,16 @@ SymbolVector CollectBindings(const Scope &dtScope) { } std::vector -RuntimeTableBuilder::DescribeBindings(const Scope &dtScope, Scope &scope) { +RuntimeTableBuilder::DescribeBindings( + const Scope &dtScope, Scope &scope, const SymbolVector &bindings) { std::vector result; - for (const SymbolRef &ref : CollectBindings(dtScope)) { + for (const Symbol &symbol : bindings) { evaluate::StructureConstructorValues values; AddValue(values, bindingSchema_, procCompName, SomeExpr{evaluate::ProcedureDesignator{ - ref.get().get().symbol()}}); + symbol.get().symbol()}}); AddValue(values, bindingSchema_, "name"s, - SaveNameAsPointerTarget(scope, ref.get().name().ToString())); + SaveNameAsPointerTarget(scope, symbol.name().ToString())); result.emplace_back(DEREF(bindingSchema_.AsDerived()), std::move(values)); } return result; @@ -1057,16 +1065,18 @@ RuntimeTableBuilder::DescribeBindings(const Scope &dtScope, Scope &scope) { std::map RuntimeTableBuilder::DescribeSpecialGenerics(const Scope &dtScope, - const Scope &thisScope, const DerivedTypeSpec *derivedTypeSpec) const { + const Scope &thisScope, const DerivedTypeSpec *derivedTypeSpec, + const SymbolVector &bindings) const { std::map specials; if (const Scope * parentScope{dtScope.GetDerivedTypeParent()}) { - specials = - DescribeSpecialGenerics(*parentScope, thisScope, derivedTypeSpec); + specials = DescribeSpecialGenerics( + *parentScope, thisScope, derivedTypeSpec, bindings); } for (const auto &pair : dtScope) { const Symbol &symbol{*pair.second}; if (const auto *generic{symbol.detailsIf()}) { - DescribeSpecialGeneric(*generic, specials, thisScope, derivedTypeSpec); + DescribeSpecialGeneric( + *generic, specials, thisScope, derivedTypeSpec, bindings); } 
} return specials; @@ -1074,15 +1084,16 @@ RuntimeTableBuilder::DescribeSpecialGenerics(const Scope &dtScope, void RuntimeTableBuilder::DescribeSpecialGeneric(const GenericDetails &generic, std::map &specials, - const Scope &dtScope, const DerivedTypeSpec *derivedTypeSpec) const { + const Scope &dtScope, const DerivedTypeSpec *derivedTypeSpec, + const SymbolVector &bindings) const { common::visit( common::visitors{ [&](const GenericKind::OtherKind &k) { if (k == GenericKind::OtherKind::Assignment) { - for (auto ref : generic.specificProcs()) { - DescribeSpecialProc(specials, *ref, /*isAssignment=*/true, + for (const Symbol &specific : generic.specificProcs()) { + DescribeSpecialProc(specials, specific, /*isAssignment=*/true, /*isFinal=*/false, std::nullopt, &dtScope, derivedTypeSpec, - /*isTypeBound=*/true); + &bindings); } } }, @@ -1092,10 +1103,10 @@ void RuntimeTableBuilder::DescribeSpecialGeneric(const GenericDetails &generic, case common::DefinedIo::ReadUnformatted: case common::DefinedIo::WriteFormatted: case common::DefinedIo::WriteUnformatted: - for (auto ref : generic.specificProcs()) { - DescribeSpecialProc(specials, *ref, /*isAssignment=*/false, + for (const Symbol &specific : generic.specificProcs()) { + DescribeSpecialProc(specials, specific, /*isAssignment=*/false, /*isFinal=*/false, io, &dtScope, derivedTypeSpec, - /*isTypeBound=*/true); + &bindings); } break; } @@ -1109,7 +1120,8 @@ void RuntimeTableBuilder::DescribeSpecialProc( std::map &specials, const Symbol &specificOrBinding, bool isAssignment, bool isFinal, std::optional io, const Scope *dtScope, - const DerivedTypeSpec *derivedTypeSpec, bool isTypeBound) const { + const DerivedTypeSpec *derivedTypeSpec, + const SymbolVector *bindings) const { const auto *binding{specificOrBinding.detailsIf()}; if (binding && dtScope) { // use most recent override binding = &DEREF(dtScope->FindComponent(specificOrBinding.name())) @@ -1128,6 +1140,9 @@ void RuntimeTableBuilder::DescribeSpecialProc( // component 
assignment as part of intrinsic assignment. // Non-type-bound generic INTERFACEs and assignments from incompatible // types must not be used for component intrinsic assignment. + if (!binding) { + return; + } CHECK(proc->dummyArguments.size() == 2); const auto t1{ DEREF(std::get_if( @@ -1137,7 +1152,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( DEREF(std::get_if( &proc->dummyArguments[1].u)) .type.type()}; - if (!binding || t1.category() != TypeCategory::Derived || + if (t1.category() != TypeCategory::Derived || t2.category() != TypeCategory::Derived || t1.IsUnlimitedPolymorphic() || t2.IsUnlimitedPolymorphic()) { return; @@ -1149,7 +1164,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( } which = proc->IsElemental() ? elementalAssignmentEnum_ : scalarAssignmentEnum_; - if (binding && binding->passName() && + if (binding->passName() && *binding->passName() == proc->dummyArguments[1].name) { argThatMightBeDescriptor = 1; isArgDescriptorSet |= 2; @@ -1234,8 +1249,19 @@ void RuntimeTableBuilder::DescribeSpecialProc( values, specialSchema_, "which"s, SomeExpr{std::move(which.value())}); AddValue(values, specialSchema_, "isargdescriptorset"s, IntExpr<1>(isArgDescriptorSet)); - AddValue(values, specialSchema_, "istypebound"s, - IntExpr<1>(isTypeBound ? 
1 : 0)); + int bindingIndex{0}; + if (bindings) { + int j{0}; + for (const Symbol &bind : DEREF(bindings)) { + ++j; + if (&bind.get().symbol() == &specific) { + bindingIndex = j; // index offset by 1 + break; + } + } + } + CHECK(bindingIndex <= 255); + AddValue(values, specialSchema_, "istypebound"s, IntExpr<1>(bindingIndex)); AddValue(values, specialSchema_, "isargcontiguousset"s, IntExpr<1>(isArgContiguousSet)); AddValue(values, specialSchema_, procCompName, @@ -1260,7 +1286,7 @@ void RuntimeTableBuilder::IncorporateDefinedIoGenericInterfaces( CHECK(std::get(genericDetails.kind().u) == definedIo); for (auto ref : genericDetails.specificProcs()) { DescribeSpecialProc(specials, *ref, false, false, definedIo, nullptr, - derivedTypeSpec, false); + derivedTypeSpec, /*bindings=*/nullptr); } } } diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index a1445187b1e98..bf520d04a50cc 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -814,6 +814,38 @@ bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) { return std::any_of(directs.begin(), directs.end(), IsAllocatable); } +static bool MayHaveDefinedAssignment( + const DerivedTypeSpec &derived, std::set &checked) { + if (const Scope *scope{derived.GetScope()}; + scope && checked.find(scope) == checked.end()) { + checked.insert(scope); + for (const auto &[_, symbolRef] : *scope) { + if (const auto *generic{symbolRef->detailsIf()}) { + if (generic->kind().IsAssignment()) { + return true; + } + } else if (symbolRef->has() && + !IsPointer(*symbolRef)) { + if (const DeclTypeSpec *type{symbolRef->GetType()}) { + if (type->IsPolymorphic()) { + return true; + } else if (const DerivedTypeSpec *derived{type->AsDerived()}) { + if (MayHaveDefinedAssignment(*derived, checked)) { + return true; + } + } + } + } + } + } + return false; +} + +bool MayHaveDefinedAssignment(const DerivedTypeSpec &derived) { + std::set checked; + return MayHaveDefinedAssignment(derived, 
checked); +} + bool IsAssumedLengthCharacter(const Symbol &symbol) { if (const DeclTypeSpec * type{symbol.GetType()}) { return type->category() == DeclTypeSpec::Character && diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90 index b30a6bf697563..8dd27d6e4c01b 100644 --- a/flang/module/__fortran_type_info.f90 +++ b/flang/module/__fortran_type_info.f90 @@ -52,7 +52,8 @@ integer(1) :: noInitializationNeeded ! 1 if no component w/ init integer(1) :: noDestructionNeeded ! 1 if no component w/ dealloc/final integer(1) :: noFinalizationNeeded ! 1 if nothing finalizeable - integer(1) :: __padding0(4) + integer(1) :: noDefinedAssignment ! 1 if no defined ASSIGNMENT(=) + integer(1) :: __padding0(3) end type type :: Binding @@ -116,7 +117,7 @@ type, bind(c) :: SpecialBinding integer(1) :: which ! SpecialBinding::Which integer(1) :: isArgDescriptorSet - integer(1) :: isTypeBound + integer(1) :: isTypeBound ! binding index + 1, if any integer(1) :: isArgContiguousSet integer(1) :: __padding0(4) type(__builtin_c_funptr) :: proc diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt index a658f6f984faf..8520bec646971 100644 --- a/flang/test/CMakeLists.txt +++ b/flang/test/CMakeLists.txt @@ -73,6 +73,7 @@ if (NOT FLANG_STANDALONE_BUILD) not llvm-dis llvm-objdump + llvm-profdata llvm-readobj split-file ) diff --git a/flang/test/Driver/flang-openmp-version-macro.f90 b/flang/test/Driver/flang-openmp-version-macro.f90 index 95b3071544d06..f690ab3819482 100644 --- a/flang/test/Driver/flang-openmp-version-macro.f90 +++ b/flang/test/Driver/flang-openmp-version-macro.f90 @@ -2,7 +2,6 @@ ! RUN: %flang_fc1 -fopenmp -cpp -E %s | FileCheck %s --check-prefix=DEFAULT-OPENMP-VERSION ! RUN: %flang_fc1 -fopenmp -fopenmp-version=11 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-11 -! RUN: %flang_fc1 -fopenmp -fopenmp-version=11 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-11 ! 
RUN: %flang_fc1 -fopenmp -fopenmp-version=20 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-20 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=25 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-25 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=30 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-30 @@ -12,6 +11,7 @@ ! RUN: %flang_fc1 -fopenmp -fopenmp-version=50 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-50 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=51 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-51 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-52 +! RUN: %flang_fc1 -fopenmp -fopenmp-version=60 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-60 ! DEFAULT-OPENMP-VERSION: integer :: var1 = 201107 ! OPENMP-VERSION-11: integer :: var1 = 199911 @@ -24,6 +24,7 @@ ! OPENMP-VERSION-50: integer :: var1 = 201811 ! OPENMP-VERSION-51: integer :: var1 = 202011 ! OPENMP-VERSION-52: integer :: var1 = 202111 +! 
OPENMP-VERSION-60: integer :: var1 = 202411 #if _OPENMP integer :: var1 = _OPENMP diff --git a/flang/test/Fir/cfg-conversion-if.fir b/flang/test/Fir/cfg-conversion-if.fir new file mode 100644 index 0000000000000..1e30ee8e64f02 --- /dev/null +++ b/flang/test/Fir/cfg-conversion-if.fir @@ -0,0 +1,46 @@ +// RUN: fir-opt --split-input-file --cfg-conversion %s | FileCheck %s + +func.func private @callee() -> none + +// CHECK-LABEL: func.func @if_then( +// CHECK-SAME: %[[ARG0:.*]]: i1) { +// CHECK: cf.cond_br %[[ARG0]] weights([10, 90]), ^bb1, ^bb2 +// CHECK: ^bb1: +// CHECK: %[[VAL_0:.*]] = fir.call @callee() : () -> none +// CHECK: cf.br ^bb2 +// CHECK: ^bb2: +// CHECK: return +// CHECK: } +func.func @if_then(%cond: i1) { + fir.if %cond weights([10, 90]) { + fir.call @callee() : () -> none + } + return +} + +// ----- + +// CHECK-LABEL: func.func @if_then_else( +// CHECK-SAME: %[[ARG0:.*]]: i1) -> i32 { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 +// CHECK: cf.cond_br %[[ARG0]] weights([90, 10]), ^bb1, ^bb2 +// CHECK: ^bb1: +// CHECK: cf.br ^bb3(%[[VAL_0]] : i32) +// CHECK: ^bb2: +// CHECK: cf.br ^bb3(%[[VAL_1]] : i32) +// CHECK: ^bb3(%[[VAL_2:.*]]: i32): +// CHECK: cf.br ^bb4 +// CHECK: ^bb4: +// CHECK: return %[[VAL_2]] : i32 +// CHECK: } +func.func @if_then_else(%cond: i1) -> i32 { + %c0 = arith.constant 0 : i32 + %c1 = arith.constant 1 : i32 + %result = fir.if %cond weights([90, 10]) -> i32 { + fir.result %c0 : i32 + } else { + fir.result %c1 : i32 + } + return %result : i32 +} diff --git a/flang/test/Fir/fir-ops.fir b/flang/test/Fir/fir-ops.fir index 9c444d2f4e0bc..3585bf9efca3e 100644 --- a/flang/test/Fir/fir-ops.fir +++ b/flang/test/Fir/fir-ops.fir @@ -1015,3 +1015,19 @@ func.func @test_box_total_elements(%arg0: !fir.class> %6 = arith.addi %2, %5 : index return %6 : index } + +// CHECK-LABEL: func.func @test_if_weights( +// CHECK-SAME: %[[ARG0:.*]]: i1) { +func.func @test_if_weights(%cond: i1) { +// CHECK: 
fir.if %[[ARG0]] weights([99, 1]) { +// CHECK: } + fir.if %cond weights([99, 1]) { + } +// CHECK: fir.if %[[ARG0]] weights([99, 1]) { +// CHECK: } else { +// CHECK: } + fir.if %cond weights ([99,1]) { + } else { + } + return +} diff --git a/flang/test/Fir/invalid.fir b/flang/test/Fir/invalid.fir index 45cae1f82cb8e..aca0ecc1abdc1 100644 --- a/flang/test/Fir/invalid.fir +++ b/flang/test/Fir/invalid.fir @@ -1393,3 +1393,31 @@ fir.local {type = local_init} @x.localizer : f32 init { ^bb0(%arg0: f32, %arg1: f32): fir.yield(%arg0 : f32) } + +// ----- + +func.func @wrong_weights_number_in_if_then(%cond: i1) { +// expected-error @below {{expects number of region weights to match number of regions: 1 vs 2}} + fir.if %cond weights([50]) { + } + return +} + +// ----- + +func.func @wrong_weights_number_in_if_then_else(%cond: i1) { +// expected-error @below {{expects number of region weights to match number of regions: 3 vs 2}} + fir.if %cond weights([50, 40, 10]) { + } else { + } + return +} + +// ----- + +func.func @negative_weight_in_if_then(%cond: i1) { +// expected-error @below {{weight #0 must be non-negative}} + fir.if %cond weights([-1, 101]) { + } + return +} diff --git a/flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90 b/flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90 new file mode 100644 index 0000000000000..d441cac47f5da --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90 @@ -0,0 +1,34 @@ +! This test checks the lowering of OpenMP Indirect Clause when used with the Declare Target directive + +! RUN: not flang -fc1 -emit-fir -fopenmp -fopenmp-version=52 %s 2>&1 | FileCheck %s + +module functions + implicit none + + interface + function func() result(i) + character(1) :: i + end function + end interface + +contains + function func1() result(i) + !CHECK: not yet implemented: Unhandled clause INDIRECT in DECLARE TARGET construct + !$omp declare target enter(func1) indirect(.true.) 
+ character(1) :: i + i = 'a' + return + end function +end module + +program main + use functions + implicit none + procedure (func), pointer :: ptr1=>func1 + character(1) :: val1 + + !$omp target map(from: val1) + val1 = ptr1() + !$omp end target + +end program diff --git a/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90 b/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90 deleted file mode 100644 index e820143021f9a..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s - -!=============================================================================== -! `private` clause on `target parallel` -!=============================================================================== - -! CHECK: not yet implemented: TARGET PARALLEL PRIVATE is not implemented yet -subroutine target_teams_private() -integer, dimension(3) :: i -!$omp target parallel private(i) -!$omp end target parallel -end subroutine diff --git a/flang/test/Lower/OpenMP/Todo/target-teams-private.f90 b/flang/test/Lower/OpenMP/Todo/target-teams-private.f90 deleted file mode 100644 index c8d998a5cbf94..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/target-teams-private.f90 +++ /dev/null @@ -1,13 +0,0 @@ -! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s - -!=============================================================================== -! `private` clause on `target teams` -!=============================================================================== - -! 
CHECK: not yet implemented: TARGET TEAMS PRIVATE is not implemented yet -subroutine target_teams_private() -integer, dimension(3) :: i -!$omp target teams private(i) -!$omp end target teams -end subroutine diff --git a/flang/test/Lower/OpenMP/copyprivate5.f90 b/flang/test/Lower/OpenMP/copyprivate5.f90 new file mode 100644 index 0000000000000..c75eb82a45e9f --- /dev/null +++ b/flang/test/Lower/OpenMP/copyprivate5.f90 @@ -0,0 +1,36 @@ +! Test lowering of COPYPRIVATE with character arguments +! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +! Testcase from: https://github.com/llvm/llvm-project/issues/142123 + +! CHECK-LABEL: func.func private @_copy_boxchar_c8xU( +! CHECK-SAME: %arg0: [[TYPE:!fir.ref>]], +! CHECK-SAME: %arg1: [[TYPE]]) attributes {llvm.linkage = #llvm.linkage} { +! CHECK: %[[RDST:.*]] = fir.load %arg0 : [[TYPE]] +! CHECK: %[[RSRC:.*]] = fir.load %arg1 : [[TYPE]] +! CHECK: %[[UDST:.*]]:2 = fir.unboxchar %[[RDST:.*]] : ([[UTYPE:!fir.boxchar<1>]]) -> ([[RTYPE:!fir.ref>]], [[ITYPE:index]]) +! CHECK: %[[USRC:.*]]:2 = fir.unboxchar %[[RSRC:.*]] : ([[UTYPE]]) -> ([[RTYPE]], [[ITYPE]]) +! CHECK: %[[DST:.*]]:2 = hlfir.declare %[[UDST:.*]]#0 typeparams %[[UDST:.*]]#1 {uniq_name = "[[NAME1:.*]]"} : ([[RTYPE]], [[ITYPE]]) -> ([[UTYPE]], [[RTYPE]]) +! CHECK: %[[SRC:.*]]:2 = hlfir.declare %[[USRC:.*]]#0 typeparams %[[UDST:.*]]#1 {uniq_name = "[[NAME2:.*]]"} : ([[RTYPE]], [[ITYPE]]) -> ([[UTYPE]], [[RTYPE]]) +! CHECK: hlfir.assign %[[SRC:.*]]#0 to %[[DST:.*]]#0 : [[UTYPE]], [[UTYPE]] +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPs(%arg0: !fir.boxchar<1> {fir.bindc_name = "c"}) { +! CHECK: %[[ALLOC:.*]] = fir.alloca !fir.boxchar<1> +! CHECK: fir.store %[[SRC:.*]] to %[[ALLOC:.*]] : !fir.ref> +! CHECK: omp.single copyprivate([[ALLOC:.*]] -> @_copy_boxchar_c8xU : !fir.ref>) { +! CHECK: hlfir.assign %[[NEW_VAL:.*]] to %[[SRC:.*]] : !fir.ref>, !fir.boxchar<1> +! CHECK: omp.terminator +! 
CHECK: } + +subroutine s(c) +character(*) :: c +!$omp single copyprivate(c) +c = "bar" +!$omp end single +end subroutine + +character(len=3) :: c +call s(c) +end diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90 index f0f149bb415b0..0d2db63edfe79 100644 --- a/flang/test/Lower/OpenMP/implicit-dsa.f90 +++ b/flang/test/Lower/OpenMP/implicit-dsa.f90 @@ -5,6 +5,14 @@ ! Privatizers +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST7_Y_FIRSTPRIV:.*]] : i32 +! CHECK-SAME: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST7_X_FIRSTPRIV:.*]] : i32 +! CHECK-SAME: copy { + ! CHECK-LABEL: omp.private ! CHECK-SAME: {type = private} @[[TEST6_Y_PRIV:.*]] : i32 ! CHECK-NOT: copy { @@ -277,22 +285,19 @@ subroutine implicit_dsa_test6 !$omp end task end subroutine -! Test taskgroup - it uses the same scope as task. +! Test taskgroup. !CHECK-LABEL: func @_QPimplicit_dsa_test7 !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test7Ex"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test7Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_test7Ey"} !CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.task { +!CHECK: omp.task private(@[[TEST7_X_FIRSTPRIV]] %[[X_DECL]]#0 -> %[[PRIV_X:[^,]*]], +!CHECK-SAME: @[[TEST7_Y_FIRSTPRIV]] %[[Y_DECL]]#0 -> %[[PRIV_Y:.*]] : !fir.ref, !fir.ref) { +!CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test7Ex"} +!CHECK: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} !CHECK: omp.taskgroup { -!CHECK-NEXT: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test7Ex"} -!CHECK-NEXT: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] 
{uniq_name = "_QFimplicit_dsa_test7Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref +!CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[PRIV_Y_DECL]]#0 : !fir.ref !CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref -!CHECK-NEXT: %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test7Ey"} -!CHECK-NEXT: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: } subroutine implicit_dsa_test7 diff --git a/flang/test/Lower/OpenMP/target-parallel-private.f90 b/flang/test/Lower/OpenMP/target-parallel-private.f90 new file mode 100644 index 0000000000000..cc04b77e4a527 --- /dev/null +++ b/flang/test/Lower/OpenMP/target-parallel-private.f90 @@ -0,0 +1,21 @@ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization \ +! RUN: -o - %s 2>&1 | FileCheck %s +! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization -o - %s 2>&1 |\ +! RUN: FileCheck %s + +!=============================================================================== +! `private` clause on `target parallel` +!=============================================================================== + +subroutine target_parallel_private() +integer, dimension(3) :: i +!$omp target parallel private(i) +!$omp end target parallel +end subroutine + +! CHECK: omp.private {type = private} @[[PRIVATIZER:.*]] : {{.*}} + +! CHECK: omp.target {{.*}} { +! CHECK: omp.parallel private(@[[PRIVATIZER]] %{{.*}} -> %{{.*}} : {{.*}}) { +! CHECK: } +! 
CHECK: } diff --git a/flang/test/Lower/OpenMP/target-teams-private.f90 b/flang/test/Lower/OpenMP/target-teams-private.f90 new file mode 100644 index 0000000000000..65d97649b5cf3 --- /dev/null +++ b/flang/test/Lower/OpenMP/target-teams-private.f90 @@ -0,0 +1,20 @@ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization \ +! RUN: -o - %s 2>&1 | FileCheck %s +! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization -o - %s 2>&1 |\ +! RUN: FileCheck %s + +!=============================================================================== +! `private` clause on `target teams` +!=============================================================================== + +subroutine target_teams_private() +integer, dimension(3) :: i +!$omp target teams private(i) +!$omp end target teams +end subroutine + +! CHECK: omp.target {{.*}} { +! CHECK: omp.teams { +! CHECK: %{{.*}} = fir.alloca !fir.array<3xi32> {bindc_name = "i", {{.*}}} +! CHECK: } +! CHECK: } diff --git a/flang/test/Lower/OpenMP/taskgroup02.f90 b/flang/test/Lower/OpenMP/taskgroup02.f90 new file mode 100644 index 0000000000000..1e996a030c23a --- /dev/null +++ b/flang/test/Lower/OpenMP/taskgroup02.f90 @@ -0,0 +1,32 @@ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +! Check that variables are not privatized twice when TASKGROUP is used. 
+ +!CHECK-LABEL: func.func @_QPsub() { +!CHECK: omp.parallel { +!CHECK: %[[PAR_I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsubEi"} +!CHECK: omp.master { +!CHECK: omp.taskgroup { +!CHECK-NEXT: omp.task private(@_QFsubEi_firstprivate_i32 %[[PAR_I]]#0 -> %[[TASK_I:.*]] : !fir.ref) { +!CHECK: %[[TASK_I_DECL:.*]]:2 = hlfir.declare %[[TASK_I]] {uniq_name = "_QFsubEi"} +!CHECK: } +!CHECK: } +!CHECK: } +!CHECK: } + +subroutine sub() + integer, dimension(10) :: a + integer :: i + + !$omp parallel + !$omp master + do i=1,10 + !$omp taskgroup + !$omp task shared(a) + a(i) = 1 + !$omp end task + !$omp end taskgroup + end do + !$omp end master + !$omp end parallel +end subroutine diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90 index 28f0bf78f33c9..2e05b652822b5 100644 --- a/flang/test/Lower/volatile-openmp.f90 +++ b/flang/test/Lower/volatile-openmp.f90 @@ -23,11 +23,11 @@ ! CHECK: %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref>>}>> ! CHECK: %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref>>}>>) -> !fir.ref>>}>, volatile> ! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEcontainer"} : (!fir.ref>>}>, volatile>) -> (!fir.ref>>}>, volatile>, !fir.ref>>}>, volatile>) -! 
CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> +! CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> ! 
CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1> -! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> 
(!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) -! 
CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>> -! CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) -> 
(!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>, !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) +! 
CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, 
!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) +! CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>> +! 
CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, 
!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) ! CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>, volatile>) -> !fir.ref>>, volatile> ! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref>>, volatile> ! CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box>>, index) -> (index, index, index) diff --git a/flang/test/Parser/OpenMP/declare-target-indirect-tree.f90 b/flang/test/Parser/OpenMP/declare-target-indirect-tree.f90 new file mode 100644 index 0000000000000..df85942ec15a5 --- /dev/null +++ b/flang/test/Parser/OpenMP/declare-target-indirect-tree.f90 @@ -0,0 +1,53 @@ +! REQUIRES: openmp_runtime + +! RUN: %flang_fc1 %openmp_flags -fopenmp-version=52 -fdebug-dump-parse-tree %s | FileCheck %s +! RUN: %flang_fc1 %openmp_flags -fdebug-unparse -fopenmp-version=52 %s | FileCheck %s --check-prefix="UNPARSE" + +module functions + implicit none + + interface + function func() result(i) + character(1) :: i + end function + end interface + +contains + function func1() result(i) + !$omp declare target enter(func1) indirect(.true.) 
+ !CHECK: | | | | | OmpDeclareTargetSpecifier -> OmpDeclareTargetWithClause -> OmpClauseList -> OmpClause -> Enter -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'func1' + !CHECK-NEXT: | | | | | OmpClause -> Indirect -> OmpIndirectClause -> Scalar -> Logical -> Expr = '.true._4' + !CHECK-NEXT: | | | | | | LiteralConstant -> LogicalLiteralConstant + !CHECK-NEXT: | | | | | | | bool = 'true' + character(1) :: i + i = 'a' + return + end function + + function func2() result(i) + !$omp declare target enter(func2) indirect + !CHECK: | | | | | OmpDeclareTargetSpecifier -> OmpDeclareTargetWithClause -> OmpClauseList -> OmpClause -> Enter -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'func2' + !CHECK-NEXT: | | | | | OmpClause -> Indirect -> OmpIndirectClause -> + character(1) :: i + i = 'b' + return + end function +end module + +program main + use functions + implicit none + procedure (func), pointer :: ptr1=>func1, ptr2=>func2 + character(1) :: val1, val2 + + !$omp target map(from: val1) + val1 = ptr1() + !$omp end target + !$omp target map(from: val2) + val2 = ptr2() + !$omp end target + +end program + +!UNPARSE: !$OMP DECLARE TARGET ENTER(func1) INDIRECT(.true._4) +!UNPARSE: !$OMP DECLARE TARGET ENTER(func2) INDIRECT() diff --git a/flang/test/Preprocessing/bug518.F b/flang/test/Preprocessing/bug518.F index 346e04cc56d38..0b680dd5751b9 100644 --- a/flang/test/Preprocessing/bug518.F +++ b/flang/test/Preprocessing/bug518.F @@ -1,4 +1,4 @@ -! RUN: %flang -fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s ! CHECK: k=1_4 k= 1_99999999 &4 diff --git a/flang/test/Semantics/OpenMP/implicit-dsa.f90 b/flang/test/Semantics/OpenMP/implicit-dsa.f90 index 4a07e256e2bb6..1ee777d6b9723 100644 --- a/flang/test/Semantics/OpenMP/implicit-dsa.f90 +++ b/flang/test/Semantics/OpenMP/implicit-dsa.f90 @@ -141,7 +141,7 @@ subroutine implicit_dsa_test6 !$omp end task end subroutine -! 
Test taskgroup - it uses the same scope as task. +! Test taskgroup. !DEF: /implicit_dsa_test7 (Subroutine) Subprogram subroutine implicit_dsa_test7 !DEF: /implicit_dsa_test7/x ObjectEntity INTEGER(4) @@ -150,8 +150,8 @@ subroutine implicit_dsa_test7 !$omp task !$omp taskgroup - !DEF: /implicit_dsa_test7/OtherConstruct1/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4) - !DEF: /implicit_dsa_test7/OtherConstruct1/y (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4) + !DEF: /implicit_dsa_test7/OtherConstruct1/OtherConstruct1/x HostAssoc INTEGER(4) + !DEF: /implicit_dsa_test7/OtherConstruct1/OtherConstruct1/y HostAssoc INTEGER(4) x = y !$omp end taskgroup !$omp end task diff --git a/flang/test/Semantics/OpenMP/parallel-sections01.f90 b/flang/test/Semantics/OpenMP/parallel-sections01.f90 index 6c5a053bf49c9..19448258af766 100644 --- a/flang/test/Semantics/OpenMP/parallel-sections01.f90 +++ b/flang/test/Semantics/OpenMP/parallel-sections01.f90 @@ -35,6 +35,8 @@ program OmpConstructSections01 !$omp section print *, "This is a single statement structured block" !$omp section + !ERROR: invalid branch into an OpenMP structured block + !ERROR: invalid branch leaving an OpenMP structured block open (10, file="random-file-name.txt", err=30) !ERROR: invalid branch into an OpenMP structured block !ERROR: invalid branch leaving an OpenMP structured block diff --git a/flang/test/Semantics/OpenMP/sections-goto.f90 b/flang/test/Semantics/OpenMP/sections-goto.f90 new file mode 100644 index 0000000000000..9fa9df9f50b9c --- /dev/null +++ b/flang/test/Semantics/OpenMP/sections-goto.f90 @@ -0,0 +1,11 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenmp +! Regression test for #143231 + +!$omp sections +! ERROR: invalid branch into an OpenMP structured block +! 
ERROR: invalid branch leaving an OpenMP structured block +goto 10 +!$omp section +10 print *, "Invalid jump" +!$omp end sections +end diff --git a/flang/test/Semantics/OpenMP/sections02.f90 b/flang/test/Semantics/OpenMP/sections02.f90 index ee29922a72c08..8144b491071d8 100644 --- a/flang/test/Semantics/OpenMP/sections02.f90 +++ b/flang/test/Semantics/OpenMP/sections02.f90 @@ -19,6 +19,8 @@ program OmpConstructSections01 !$omp section print *, "This is a single statement structured block" !$omp section + !ERROR: invalid branch into an OpenMP structured block + !ERROR: invalid branch leaving an OpenMP structured block open (10, file="random-file-name.txt", err=30) !ERROR: invalid branch into an OpenMP structured block !ERROR: invalid branch leaving an OpenMP structured block diff --git a/flang/test/Semantics/allocate11.f90 b/flang/test/Semantics/allocate11.f90 index 1b7495e9fc07d..8aeb069df09f2 100644 --- a/flang/test/Semantics/allocate11.f90 +++ b/flang/test/Semantics/allocate11.f90 @@ -163,6 +163,7 @@ subroutine C938_C947(var2, ptr, ptr2, fptr, my_team, srca) allocate(var2(2)[5:*], MOLD=my_team) !ERROR: SOURCE or MOLD expression type must not be C_PTR or C_FUNPTR from ISO_C_BINDING when an allocatable object is a coarray allocate(var2(2)[5:*], MOLD=ptr) + !ERROR: Allocation has extent 2 on dimension 1, but SOURCE= has extent 9 !ERROR: SOURCE or MOLD expression type must not be C_PTR or C_FUNPTR from ISO_C_BINDING when an allocatable object is a coarray allocate(var2(2)[5:*], SOURCE=ptr2) !ERROR: SOURCE or MOLD expression type must not be C_PTR or C_FUNPTR from ISO_C_BINDING when an allocatable object is a coarray diff --git a/flang/test/Semantics/indirect01.f90 b/flang/test/Semantics/indirect01.f90 new file mode 100644 index 0000000000000..59850662275d9 --- /dev/null +++ b/flang/test/Semantics/indirect01.f90 @@ -0,0 +1,34 @@ +! This test checks the lowering of OpenMP Indirect Clause when used with the Declare Target directive + +! 
RUN: not flang -fopenmp -fopenmp-version=52 %s 2>&1 | FileCheck %s + +module functions + implicit none + + interface + function func() result(i) + character(1) :: i + end function + end interface + +contains + function func1() result(i) + !CHECK: The INDIRECT clause cannot be used without the ENTER clause with the DECLARE TARGET directive. + !$omp declare target indirect(.true.) + character(1) :: i + i = 'a' + return + end function +end module + +program main + use functions + implicit none + procedure (func), pointer :: ptr1=>func1 + character(1) :: val1 + + !$omp target map(from: val1) + val1 = ptr1() + !$omp end target + +end program diff --git a/flang/test/Semantics/indirect02.f90 b/flang/test/Semantics/indirect02.f90 new file mode 100644 index 0000000000000..273f8856626b7 --- /dev/null +++ b/flang/test/Semantics/indirect02.f90 @@ -0,0 +1,36 @@ +! This test checks the lowering of OpenMP Indirect Clause when used with the Declare Target directive + +! RUN: not flang -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s --check-prefix="CHECK-50" +! RUN: not flang -fopenmp -fopenmp-version=52 %s 2>&1 | FileCheck %s --check-prefix="CHECK-52" + +module functions + implicit none + + interface + function func() result(i) + character(1) :: i + end function + end interface + +contains + function func1() result(i) + !CHECK-50: INDIRECT clause is not allowed on directive DECLARE TARGET in OpenMP v5.0, try -fopenmp-version=51 + !CHECK-52: not yet implemented: Unhandled clause INDIRECT in DECLARE TARGET construct + !$omp declare target enter(func1) indirect(.true.) 
+ character(1) :: i + i = 'a' + return + end function +end module + +program main + use functions + implicit none + procedure (func), pointer :: ptr1=>func1 + character(1) :: val1 + + !$omp target map(from: val1) + val1 = ptr1() + !$omp end target + +end program diff --git a/flang/test/Semantics/modfile71.F90 b/flang/test/Semantics/modfile71.F90 index 7c3c7f5b48958..7f32eb18c6f8f 100644 --- a/flang/test/Semantics/modfile71.F90 +++ b/flang/test/Semantics/modfile71.F90 @@ -1,6 +1,7 @@ -!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 %s -!RUN: %flang_fc1 -fsyntax-only -DSTEP=2 %s -!RUN: not %flang_fc1 -fsyntax-only -pedantic %s 2>&1 | FileCheck %s +!RUN: rm -rf %t && mkdir -p %t +!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 -J%t %s +!RUN: %flang_fc1 -fsyntax-only -DSTEP=2 -J%t %s +!RUN: not %flang_fc1 -fsyntax-only -pedantic -J%t %s 2>&1 | FileCheck %s ! Tests that a module captured in a hermetic module file is compatible when ! USE'd with a module of the same name USE'd directly. 
diff --git a/flang/test/Semantics/modfile75.F90 b/flang/test/Semantics/modfile75.F90 index aba00ffac848a..8f7adafe7204d 100644 --- a/flang/test/Semantics/modfile75.F90 +++ b/flang/test/Semantics/modfile75.F90 @@ -1,4 +1,5 @@ -!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang_fc1 -fdebug-unparse %s | FileCheck %s +!RUN: rm -rf %t && mkdir -p %t +!RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang_fc1 -fdebug-unparse -J%t %s | FileCheck %s #if WHICH == 1 module modfile75a diff --git a/flang/test/Semantics/modfile76.F90 b/flang/test/Semantics/modfile76.F90 index 50ee9a088e119..c7ae91bd42bed 100644 --- a/flang/test/Semantics/modfile76.F90 +++ b/flang/test/Semantics/modfile76.F90 @@ -1,23 +1,24 @@ -!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 %s -!RUN: %flang_fc1 -fsyntax-only %s +!RUN: rm -rf %t && mkdir -p %t +!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 -J%t %s +!RUN: %flang_fc1 -fsyntax-only -J%t %s ! Tests that a BIND(C) variable in a module A captured in a hermetic module ! file USE'd in a module B is not creating bogus complaints about BIND(C) name ! conflict when both module A and B are later accessed. #if STEP == 1 -module modfile75a +module modfile76a integer, bind(c) :: x end -module modfile75b - use modfile75a ! capture hermetically +module modfile76b + use modfile76a ! 
capture hermetically end #else subroutine test - use modfile75a - use modfile75b + use modfile76a + use modfile76b implicit none print *, x end subroutine diff --git a/flang/test/Semantics/modfile77.F90 b/flang/test/Semantics/modfile77.F90 index a82904ebbcc22..9ad615c16c43c 100644 --- a/flang/test/Semantics/modfile77.F90 +++ b/flang/test/Semantics/modfile77.F90 @@ -1,4 +1,5 @@ -!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile77c.mod | FileCheck %s +!RUN: rm -rf %t && mkdir -p %t +!RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang -c -fhermetic-module-files -J%t %s && cat %t/modfile77c.mod | FileCheck %s #if WHICH == 1 module modfile77a diff --git a/flang/test/Semantics/modfile78.F90 b/flang/test/Semantics/modfile78.F90 index cb3eccd9a4108..19b9ac39de934 100644 --- a/flang/test/Semantics/modfile78.F90 +++ b/flang/test/Semantics/modfile78.F90 @@ -1,4 +1,5 @@ -!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile78c.mod | FileCheck %s +!RUN: rm -rf %t && mkdir -p %t +!RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang -c -fhermetic-module-files -J%t %s && cat %t/modfile78c.mod | FileCheck %s #if WHICH == 1 module modfile78a diff --git a/flang/test/Semantics/modfile79.F90 b/flang/test/Semantics/modfile79.F90 new file mode 100644 index 0000000000000..ae156527b3bf3 --- /dev/null +++ b/flang/test/Semantics/modfile79.F90 @@ -0,0 +1,34 @@ +!RUN: rm -rf %t && mkdir -p %t +!RUN: %flang -c -DWHICH=1 -J%t %s && FileCheck %s <%t/modfile79a.mod && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang -c -J%t %s && FileCheck %s <%t/modfile79a.mod + +!Ensure that writing modfile79c.mod doesn't cause a spurious +!regeneration 
of modfile79a.mod from its copy in the hermetic +!module file modfile79b.mod. +!CHECK: !mod$ v1 sum:93ec75fe672c5b6c +!CHECK-NEXT: module modfile79a + +#if WHICH == 1 +module modfile79a + interface foo + module procedure foo + end interface + contains + subroutine foo + end +end +#elif WHICH == 2 +module modfile79b + use modfile79a + interface bar + procedure foo + end interface +end +#else +module modfile79c + use modfile79b + contains + subroutine test + call bar + end +end +#endif diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90 index d228cd2a84ca4..bb20c546e0261 100644 --- a/flang/test/Semantics/typeinfo01.f90 +++ b/flang/test/Semantics/typeinfo01.f90 @@ -8,7 +8,7 @@ module m01 end type !CHECK: Module scope: m01 !CHECK: .c.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.n,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .n.n, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"n" !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, 
ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1" !CHECK: DerivedType scope: t1 @@ -23,8 +23,8 @@ module m02 end type !CHECK: .c.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:1_8 init:[component::component(name=.n.parent,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.parent,lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.cn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=4_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .c.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.pn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) end module module m03 @@ -35,7 +35,7 @@ module m03 type(kpdt(4)) :: x !CHECK: .c.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.a,genre=1_1,category=2_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .dt.kpdt, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.kpdt,uninstantiated=NULL(),kindparameter=.kp.kpdt,lenparameterkind=NULL()) -!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .kp.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(8) shape: 0_8:0_8 init:[INTEGER(8)::4_8] end module @@ -49,7 +49,7 @@ module m04 subroutine s1(x) class(tbps), intent(in) :: x end subroutine -!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .v.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=s1,name=.n.b1),binding(proc=s1,name=.n.b2)] end module @@ -61,7 +61,7 @@ module m05 subroutine s1(x) class(t), intent(in) :: x end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: 
.dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .p.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(procptrcomponent) shape: 0_8:0_8 init:[procptrcomponent::procptrcomponent(name=.n.p1,offset=0_8,initialization=s1)] end module @@ -85,8 +85,8 @@ subroutine s2(x, y) class(t), intent(in) :: y end subroutine !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] @@ -113,8 +113,8 @@ subroutine s2(x, y) class(t2), intent(in) :: y end subroutine !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) 
shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] @@ -132,7 +132,7 @@ impure elemental subroutine s1(x, y) class(t), intent(out) :: x class(t), intent(in) :: y end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] end module @@ -155,8 +155,8 @@ impure elemental subroutine s3(x) subroutine s4(x) type(t), contiguous :: x(:,:,:) end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=1_1,proc=s4)] end module module m09 @@ -197,8 +197,8 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=3_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=4_1,isargcontiguousset=0_1,proc=wu)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)] end module @@ -246,7 +246,7 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] end module @@ -263,7 +263,7 @@ module m11 !CHECK: .c.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:3_8 
init:[component::component(name=.n.allocatable,genre=3_1,category=2_1,kind=4_1,rank=1_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.pointer,genre=2_1,category=2_1,kind=4_1,rank=0_1,offset=48_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=.di.t.pointer),component(name=.n.chauto,genre=4_1,category=4_1,kind=1_1,rank=0_1,offset=72_8,characterlen=value(genre=3_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.automatic,genre=4_1,category=2_1,kind=4_1,rank=1_1,offset=96_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=.b.t.automatic,initialization=NULL())] !CHECK: .di.t.pointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(.dp.t.pointer) init:.dp.t.pointer(pointer=target) !CHECK: .dp.t.pointer (CompilerCreated): DerivedType components: pointer -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .lpk.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::8_1] !CHECK: DerivedType scope: .dp.t.pointer size=24 alignment=8 instantiation of .dp.t.pointer !CHECK: 
pointer, POINTER size=24 offset=0: ObjectEntity type: REAL(4) diff --git a/flang/test/Semantics/typeinfo03.f90 b/flang/test/Semantics/typeinfo03.f90 index f0c0a817da4a4..e2552d0a21d6f 100644 --- a/flang/test/Semantics/typeinfo03.f90 +++ b/flang/test/Semantics/typeinfo03.f90 @@ -6,4 +6,4 @@ module m class(*), pointer :: sp, ap(:) end type end module -!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90 index de8464321a409..94dd2199db35a 100644 --- a/flang/test/Semantics/typeinfo04.f90 +++ b/flang/test/Semantics/typeinfo04.f90 @@ -7,18 +7,18 @@ module m contains final :: final end type -!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) type, abstract :: t1 end type -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) type, abstract :: t2 real, allocatable :: a(:) end type -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) type, abstract :: t3 type(finalizable) :: x end type -!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) contains impure elemental subroutine final(x) type(finalizable), intent(in out) :: x diff --git a/flang/test/Semantics/typeinfo05.f90 b/flang/test/Semantics/typeinfo05.f90 index 2a7f12a153eb8..df1aecf3821de 100644 --- a/flang/test/Semantics/typeinfo05.f90 +++ b/flang/test/Semantics/typeinfo05.f90 @@ -7,10 +7,10 @@ program main type t1 type(t2), pointer :: b end type t1 -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) type :: t2 type(t1) :: a end type t2 -! 
CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) end program main diff --git a/flang/test/Semantics/typeinfo06.f90 b/flang/test/Semantics/typeinfo06.f90 index 2385709a8eb44..22f37b1a4369d 100644 --- a/flang/test/Semantics/typeinfo06.f90 +++ b/flang/test/Semantics/typeinfo06.f90 @@ -7,10 +7,10 @@ program main type t1 type(t2), allocatable :: b end type t1 -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) type :: t2 type(t1) :: a end type t2 -! 
CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) +! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) end program main diff --git a/flang/test/Semantics/typeinfo07.f90 b/flang/test/Semantics/typeinfo07.f90 index e8766d9811db8..ab20d6f601106 100644 --- a/flang/test/Semantics/typeinfo07.f90 +++ b/flang/test/Semantics/typeinfo07.f90 @@ -16,7 +16,7 @@ type(t_container_extension) :: wrapper end type end -! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) -! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) -! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) -! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) +! 
CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) +! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) diff --git a/flang/test/Semantics/typeinfo08.f90 b/flang/test/Semantics/typeinfo08.f90 index 689cf469dee3b..391a66f3d6664 100644 --- a/flang/test/Semantics/typeinfo08.f90 +++ b/flang/test/Semantics/typeinfo08.f90 @@ -13,7 +13,7 @@ module m !CHECK: Module scope: m size=0 alignment=1 sourceRange=113 bytes !CHECK: .c.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t1,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .lpk.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity 
type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::4_1] !CHECK: .n.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"s" !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1" diff --git a/flang/test/Semantics/typeinfo11.f90 b/flang/test/Semantics/typeinfo11.f90 index 92efc8f9ea54b..08e0b95abb763 100644 --- a/flang/test/Semantics/typeinfo11.f90 +++ b/flang/test/Semantics/typeinfo11.f90 @@ -14,4 +14,4 @@ type(t2) x end -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90 new file mode 100644 index 0000000000000..6b23b63d28b1d --- /dev/null +++ b/flang/test/Semantics/typeinfo12.f90 @@ -0,0 +1,67 @@ +!RUN: bbc --dump-symbols %s | FileCheck %s +!Check "nodefinedassignment" settings. 
+ +module m01 + + type hasAsst1 + contains + procedure asst1 + generic :: assignment(=) => asst1 + end type +!CHECK: .dt.hasasst1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.hasasst1,name=.n.hasasst1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.hasasst1,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) + + type hasAsst2 ! no defined assignment relevant to the runtime + end type + interface assignment(=) + procedure asst2 + end interface +!CHECK: .dt.hasasst2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.hasasst2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + type test1 + type(hasAsst1) c + end type +!CHECK: .dt.test1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) + + type test2 + type(hasAsst2) c + end type +!CHECK: .dt.test2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + 
type test3 + type(hasAsst1), pointer :: p + end type +!CHECK: .dt.test3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test3,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test3,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + type test4 + type(hasAsst2), pointer :: p + end type +!CHECK: .dt.test4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test4,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + type, extends(hasAsst1) :: test5 + end type +!CHECK: .dt.test5, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.test5,name=.n.test5,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test5,procptr=NULL(),special=.s.test5,specialbitset=4_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) + + type, extends(hasAsst2) :: test6 + end type +!CHECK: .dt.test6, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test6,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test6,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + type test7 + type(test7), allocatable :: c + end type +!CHECK: .dt.test7, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: 
TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test7,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test7,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + type test8 + class(test8), allocatable :: c + end type +!CHECK: .dt.test8, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test8,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test8,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) + + contains + impure elemental subroutine asst1(left, right) + class(hasAsst1), intent(out) :: left + class(hasAsst1), intent(in) :: right + end + impure elemental subroutine asst2(left, right) + class(hasAsst2), intent(out) :: left + class(hasAsst2), intent(in) :: right + end +end diff --git a/flang/test/Semantics/typeinfo13.f90 b/flang/test/Semantics/typeinfo13.f90 index cf4abf9e38181..ad824ad3590a2 100644 --- a/flang/test/Semantics/typeinfo13.f90 +++ b/flang/test/Semantics/typeinfo13.f90 @@ -22,5 +22,5 @@ impure elemental subroutine override(to, from) end end -!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=override)] +!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=override)] !CHECK: .v.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 
init:[binding::binding(proc=baseassign,name=.n.baseassign),binding(proc=override,name=.n.override)] diff --git a/libc/hdr/types/char8_t.h b/libc/hdr/types/char8_t.h index 31de764658f9e..4d71e3dd89098 100644 --- a/libc/hdr/types/char8_t.h +++ b/libc/hdr/types/char8_t.h @@ -9,14 +9,6 @@ #ifndef LLVM_LIBC_HDR_TYPES_CHAR8_T_H #define LLVM_LIBC_HDR_TYPES_CHAR8_T_H -#ifdef LIBC_FULL_BUILD - #include "include/llvm-libc-types/char8_t.h" -#else // overlay mode - -#include "hdr/uchar_overlay.h" - -#endif // LLVM_LIBC_FULL_BUILD - #endif // LLVM_LIBC_HDR_TYPES_CHAR8_T_H diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 6c3e1520e5aff..4c77d3c541cdf 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -465,7 +465,10 @@ template struct DyadicFloat { // exponents coming in to this function _shouldn't_ be that large). The // result should always end up as a positive size_t. size_t shift = -static_cast(exponent); - new_mant >>= shift; + if (shift >= Bits) + new_mant = 0; + else + new_mant >>= shift; round_dir = rounding_direction(mantissa, shift, sign); if (round_dir > 0) ++new_mant; diff --git a/libc/src/__support/HashTable/CMakeLists.txt b/libc/src/__support/HashTable/CMakeLists.txt index 3c487e4f29264..a1de0680cc7d5 100644 --- a/libc/src/__support/HashTable/CMakeLists.txt +++ b/libc/src/__support/HashTable/CMakeLists.txt @@ -32,9 +32,8 @@ add_header_library( libc.src.__support.macros.attributes libc.src.__support.macros.optimization libc.src.__support.memory_size - libc.src.string.memset - libc.src.string.strcmp - libc.src.string.strlen + libc.src.string.memory_utils.inline_strcmp + libc.src.string.string_utils ) add_header_library( diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h index 13badb90dbfde..10dd9711afbf6 100644 --- a/libc/src/__support/HashTable/table.h +++ b/libc/src/__support/HashTable/table.h @@ -18,9 +18,8 @@ #include 
"src/__support/macros/config.h" #include "src/__support/macros/optimization.h" #include "src/__support/memory_size.h" -#include "src/string/memset.h" -#include "src/string/strcmp.h" -#include "src/string/strlen.h" +#include "src/string/memory_utils/inline_strcmp.h" +#include "src/string/string_utils.h" #include #include @@ -158,7 +157,9 @@ struct HashTable { for (size_t i : masks) { size_t index = (pos + i) & entries_mask; ENTRY &entry = this->entry(index); - if (LIBC_LIKELY(entry.key != nullptr && strcmp(entry.key, key) == 0)) + auto comp = [](char l, char r) -> int { return l - r; }; + if (LIBC_LIKELY(entry.key != nullptr && + inline_strcmp(entry.key, key, comp) == 0)) return index; } BitMask available = ctrls.mask_available(); @@ -176,7 +177,7 @@ struct HashTable { LIBC_INLINE uint64_t oneshot_hash(const char *key) const { LIBC_NAMESPACE::internal::HashState hasher = state; - hasher.update(key, strlen(key)); + hasher.update(key, internal::string_length(key)); return hasher.finish(); } @@ -282,8 +283,8 @@ struct HashTable { table->entries_mask = entries - 1u; table->available_slots = entries / 8 * 7; table->state = HashState{randomness}; - memset(&table->control(0), 0x80, ctrl_sizes); - memset(mem, 0, table->offset_from_entries()); + __builtin_memset(&table->control(0), 0x80, ctrl_sizes); + __builtin_memset(mem, 0, table->offset_from_entries()); } return table; } diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt index 5cca58400ff45..6715e354e23e5 100644 --- a/libc/src/__support/wchar/CMakeLists.txt +++ b/libc/src/__support/wchar/CMakeLists.txt @@ -15,12 +15,7 @@ add_object_library( DEPENDS libc.hdr.types.char8_t libc.hdr.types.char32_t + libc.src.__support.error_or + libc.src.__support.math_extras .mbstate - .utf_ret -) - -add_header_library( - utf_ret - HDRS - utf_ret.h ) diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp index f09c7815a6cc4..5ab0447bb08b2 
100644 --- a/libc/src/__support/wchar/character_converter.cpp +++ b/libc/src/__support/wchar/character_converter.cpp @@ -8,27 +8,142 @@ #include "hdr/types/char32_t.h" #include "hdr/types/char8_t.h" +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/math_extras.h" #include "src/__support/wchar/mbstate.h" -#include "src/__support/wchar/utf_ret.h" #include "character_converter.h" namespace LIBC_NAMESPACE_DECL { namespace internal { +// This is for utf-8 bytes other than the first byte +constexpr size_t ENCODED_BITS_PER_UTF8 = 6; +// The number of bits per utf-8 byte that actually encode character +// Information not metadata (# of bits excluding the byte headers) +constexpr uint32_t MASK_ENCODED_BITS = + mask_trailing_ones(); + CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; } +void CharacterConverter::clear() { + state->partial = 0; + state->bytes_processed = 0; + state->total_bytes = 0; +} + bool CharacterConverter::isComplete() { return state->bytes_processed == state->total_bytes; } -int CharacterConverter::push(char8_t utf8_byte) {} +int CharacterConverter::push(char8_t utf8_byte) { + uint8_t num_ones = static_cast(cpp::countl_one(utf8_byte)); + // Checking the first byte if first push + if (state->bytes_processed == 0) { + // UTF-8 char has 1 byte total + if (num_ones == 0) { + state->total_bytes = 1; + } + // UTF-8 char has 2 through 4 bytes total + else if (num_ones >= 2 && num_ones <= 4) { + /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4, + we will make the base mask with 7 ones and right shift it as necessary. 
*/ + constexpr size_t SIGNIFICANT_BITS = 7; + char8_t base_mask = + static_cast(mask_trailing_ones()); + state->total_bytes = num_ones; + utf8_byte &= (base_mask >> num_ones); + } + // Invalid first byte + else { + // bytes_processed and total_bytes will always be 0 here + state->partial = static_cast(0); + return -1; + } + state->partial = static_cast(utf8_byte); + state->bytes_processed++; + return 0; + } + // Any subsequent push + // Adding 6 more bits so need to left shift + if (num_ones == 1 && !isComplete()) { + char32_t byte = utf8_byte & MASK_ENCODED_BITS; + state->partial = state->partial << ENCODED_BITS_PER_UTF8; + state->partial |= byte; + state->bytes_processed++; + return 0; + } + // Invalid byte -> reset the state + clear(); + return -1; +} + +int CharacterConverter::push(char32_t utf32) { + // we can't be partially through a conversion when pushing a utf32 value + if (!isComplete()) + return -1; + + state->partial = utf32; + state->bytes_processed = 0; + + // determine number of utf-8 bytes needed to represent this utf32 value + constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff}; + constexpr int NUM_RANGES = 4; + for (uint8_t i = 0; i < NUM_RANGES; i++) { + if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) { + state->total_bytes = i + 1; + return 0; + } + } + + // `utf32` contains a value that is too large to actually represent a valid + // unicode character + clear(); + return -1; +} + +ErrorOr CharacterConverter::pop_utf32() { + // If pop is called too early, do not reset the state, use error to determine + // whether enough bytes have been pushed + if (!isComplete() || state->bytes_processed == 0) + return Error(-1); + char32_t utf32 = state->partial; + // reset if successful pop + clear(); + return utf32; +} -int CharacterConverter::push(char32_t utf32) {} +ErrorOr CharacterConverter::pop_utf8() { + if (isComplete()) + return Error(-1); -utf_ret CharacterConverter::pop_utf8() {} + constexpr char8_t FIRST_BYTE_HEADERS[] = 
{0, 0xC0, 0xE0, 0xF0}; + constexpr char8_t CONTINUING_BYTE_HEADER = 0x80; -utf_ret CharacterConverter::pop_utf32() {} + char32_t output; + + // Shift to get the next 6 bits from the utf32 encoding + const size_t shift_amount = + (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8; + if (state->bytes_processed == 0) { + /* + Choose the correct set of most significant bits to encode the length + of the utf8 sequence. The remaining bits contain the most significant + bits of the unicode value of the character. + */ + output = FIRST_BYTE_HEADERS[state->total_bytes - 1] | + (state->partial >> shift_amount); + } else { + // Get the next 6 bits and format it like so: 10xxxxxx + output = CONTINUING_BYTE_HEADER | + ((state->partial >> shift_amount) & MASK_ENCODED_BITS); + } + + state->bytes_processed++; + return static_cast(output); +} } // namespace internal } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h index d0602d2defe22..c4ba7cf6b689f 100644 --- a/libc/src/__support/wchar/character_converter.h +++ b/libc/src/__support/wchar/character_converter.h @@ -11,8 +11,9 @@ #include "hdr/types/char32_t.h" #include "hdr/types/char8_t.h" +#include "src/__support/common.h" +#include "src/__support/error_or.h" #include "src/__support/wchar/mbstate.h" -#include "src/__support/wchar/utf_ret.h" namespace LIBC_NAMESPACE_DECL { namespace internal { @@ -24,13 +25,14 @@ class CharacterConverter { public: CharacterConverter(mbstate *mbstate); + void clear(); bool isComplete(); int push(char8_t utf8_byte); int push(char32_t utf32); - utf_ret pop_utf8(); - utf_ret pop_utf32(); + ErrorOr pop_utf8(); + ErrorOr pop_utf32(); }; } // namespace internal diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h index d33ee354a5443..fb08fb4eaa188 100644 --- a/libc/src/__support/wchar/mbstate.h +++ b/libc/src/__support/wchar/mbstate.h @@ -17,8 +17,17 @@ 
namespace LIBC_NAMESPACE_DECL { namespace internal { struct mbstate { + // store a partial codepoint (in UTF-32) char32_t partial; + + /* + Progress towards a conversion + For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte) + For utf32 -> utf8, increases with each CharacterConverter::pop_utf8() + */ uint8_t bytes_processed; + + // Total number of bytes that will be needed to represent this character uint8_t total_bytes; }; diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h deleted file mode 100644 index fa99b76159bd8..0000000000000 --- a/libc/src/__support/wchar/utf_ret.h +++ /dev/null @@ -1,24 +0,0 @@ -//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H -#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H - -#include "src/__support/common.h" - -namespace LIBC_NAMESPACE_DECL { -namespace internal { -template struct utf_ret { - T out; - int error; -}; - -} // namespace internal -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index c94698ec02953..7253c6549a4e4 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -21,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL { namespace { -LIBC_INLINE int raw_write_hook(cpp::string_view new_str, void *) { - write_to_stderr(new_str); +LIBC_INLINE int stdout_write_hook(cpp::string_view new_str, void *) { + write_to_stdout(new_str); return printf_core::WRITE_OK; } @@ -35,11 +35,11 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer 
semantics, as well as handling // destruction automatically. va_end(vlist); - constexpr size_t BUFF_SIZE = 1024; + static constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer wb( - buffer, BUFF_SIZE, &raw_write_hook, nullptr); + buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer writer(wb); int retval = printf_core::printf_main(&writer, format, args); diff --git a/libc/src/stdio/baremetal/putchar.cpp b/libc/src/stdio/baremetal/putchar.cpp index 0ba46a5ade6c9..ac21e6e783b01 100644 --- a/libc/src/stdio/baremetal/putchar.cpp +++ b/libc/src/stdio/baremetal/putchar.cpp @@ -16,7 +16,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, putchar, (int c)) { char uc = static_cast(c); - write_to_stderr(cpp::string_view(&uc, 1)); + write_to_stdout(cpp::string_view(&uc, 1)); return 0; } diff --git a/libc/src/stdio/baremetal/puts.cpp b/libc/src/stdio/baremetal/puts.cpp index 5062efda1c0dc..fcd3aa086b2bf 100644 --- a/libc/src/stdio/baremetal/puts.cpp +++ b/libc/src/stdio/baremetal/puts.cpp @@ -17,8 +17,8 @@ LLVM_LIBC_FUNCTION(int, puts, (const char *__restrict str)) { cpp::string_view str_view(str); // TODO: Can we combine these to avoid needing two writes? 
- write_to_stderr(str_view); - write_to_stderr("\n"); + write_to_stdout(str_view); + write_to_stdout("\n"); return 0; } diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index 3e8631abd90d9..ab02533f14911 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -21,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL { namespace { -LIBC_INLINE int raw_write_hook(cpp::string_view new_str, void *) { - write_to_stderr(new_str); +LIBC_INLINE int stdout_write_hook(cpp::string_view new_str, void *) { + write_to_stdout(new_str); return printf_core::WRITE_OK; } @@ -33,11 +33,11 @@ LLVM_LIBC_FUNCTION(int, vprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - constexpr size_t BUFF_SIZE = 1024; + static constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer wb( - buffer, BUFF_SIZE, &raw_write_hook, nullptr); + buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer writer(wb); int retval = printf_core::printf_main(&writer, format, args); diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 4fb0dae86e5ca..9f626ed31cc07 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -275,3 +275,8 @@ add_subdirectory(fixed_point) add_subdirectory(HashTable) add_subdirectory(time) add_subdirectory(threads) +# Requires access to uchar header which is not on MacOS +# Cannot currently build this on MacOS in overlay mode +if(NOT(LIBC_TARGET_OS_IS_DARWIN)) + add_subdirectory(wchar) +endif() diff --git a/libc/test/src/__support/HashTable/table_test.cpp b/libc/test/src/__support/HashTable/table_test.cpp index a579bfabb2d7b..ba9849b6b5af9 100644 --- a/libc/test/src/__support/HashTable/table_test.cpp +++ b/libc/test/src/__support/HashTable/table_test.cpp @@ -108,7 +108,9 @@ 
TEST(LlvmLibcTableTest, Insertion) { static_cast(keys[CAP].bytes)); for (size_t i = 0; i <= CAP; ++i) { - ASSERT_EQ(strcmp(table->find(keys[i].bytes)->key, keys[i].bytes), 0); + auto comp = [](char l, char r) -> int { return l - r; }; + ASSERT_EQ( + inline_strcmp(table->find(keys[i].bytes)->key, keys[i].bytes, comp), 0); } for (size_t i = CAP + 1; i < 256; ++i) { ASSERT_EQ(table->find(keys[i].bytes), static_cast(nullptr)); diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt new file mode 100644 index 0000000000000..5176bfd4b024b --- /dev/null +++ b/libc/test/src/__support/wchar/CMakeLists.txt @@ -0,0 +1,21 @@ +add_custom_target(libc-support-wchar-tests) + +add_libc_test( + utf8_to_32_test + SUITE + libc-support-tests + SRCS + utf8_to_32_test.cpp + DEPENDS + libc.src.__support.wchar.character_converter +) + +add_libc_test( + utf32_to_8_test + SUITE + libc-support-tests + SRCS + utf32_to_8_test.cpp + DEPENDS + libc.src.__support.wchar.character_converter +) diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp new file mode 100644 index 0000000000000..f4c5cb863ff38 --- /dev/null +++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp @@ -0,0 +1,180 @@ +//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/common.h" +#include "src/__support/wchar/character_converter.h" +#include "src/__support/wchar/mbstate.h" + +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) { + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::CharacterConverter cr(&state); + cr.clear(); + + // utf8 1-byte encodings are identical to their utf32 representations + char32_t utf32_A = 0x41; // 'A' + cr.push(utf32_A); + auto popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 'A'); + ASSERT_TRUE(cr.isComplete()); + + char32_t utf32_B = 0x42; // 'B' + cr.push(utf32_B); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 'B'); + ASSERT_TRUE(cr.isComplete()); + + // should error if we try to pop another utf8 byte out + popped = cr.pop_utf8(); + ASSERT_FALSE(popped.has_value()); +} + +TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) { + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::CharacterConverter cr(&state); + cr.clear(); + + // testing utf32: 0xff -> utf8: 0xc3 0xbf + char32_t utf32 = 0xff; + cr.push(utf32); + auto popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xc3); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xbf); + ASSERT_TRUE(cr.isComplete()); + + // testing utf32: 0x58e -> utf8: 0xd6 0x8e + utf32 = 0x58e; + cr.push(utf32); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xd6); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0x8e); + ASSERT_TRUE(cr.isComplete()); 
+ + // should error if we try to pop another utf8 byte out + popped = cr.pop_utf8(); + ASSERT_FALSE(popped.has_value()); +} + +TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) { + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::CharacterConverter cr(&state); + cr.clear(); + + // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95 + char32_t utf32 = 0xac15; + cr.push(utf32); + auto popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xea); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xb0); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0x95); + ASSERT_TRUE(cr.isComplete()); + + // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb + utf32 = 0x267b; + cr.push(utf32); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xe2); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0x99); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xbb); + ASSERT_TRUE(cr.isComplete()); + + // should error if we try to pop another utf8 byte out + popped = cr.pop_utf8(); + ASSERT_FALSE(popped.has_value()); +} + +TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) { + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::CharacterConverter cr(&state); + cr.clear(); + + // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1 + char32_t utf32 = 0x1f921; + cr.push(utf32); + auto popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xf0); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 
0x9f); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xa4); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xa1); + ASSERT_TRUE(cr.isComplete()); + + // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1 + utf32 = 0x12121; + cr.push(utf32); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xf0); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0x92); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0x84); + ASSERT_TRUE(!cr.isComplete()); + popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + ASSERT_EQ(static_cast(popped.value()), 0xa1); + ASSERT_TRUE(cr.isComplete()); + + // should error if we try to pop another utf8 byte out + popped = cr.pop_utf8(); + ASSERT_FALSE(popped.has_value()); +} + +TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) { + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::CharacterConverter cr(&state); + cr.clear(); + + // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1 + char32_t utf32 = 0x12121; + ASSERT_EQ(cr.push(utf32), 0); + auto popped = cr.pop_utf8(); + ASSERT_TRUE(popped.has_value()); + + // can't push a utf32 without finishing popping the utf8 bytes out + int err = cr.push(utf32); + ASSERT_EQ(err, -1); +} diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp new file mode 100644 index 0000000000000..9cb059faa9374 --- /dev/null +++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp @@ -0,0 +1,196 @@ +//===-- Unittests for character_converter utf8->utf32 ---------------------===// +// +// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/error_or.h" +#include "src/__support/wchar/character_converter.h" +#include "src/__support/wchar/mbstate.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + char ch = 'A'; + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast(ch)); + auto wch = char_conv.pop_utf32(); + + ASSERT_EQ(err, 0); + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast(wch.value()), 65); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[2] = {static_cast(0xC2), + static_cast(0x8E)}; // Ž car symbol + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + char_conv.push(static_cast(ch[0])); + char_conv.push(static_cast(ch[1])); + auto wch = char_conv.pop_utf32(); + + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast(wch.value()), 142); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[3] = {static_cast(0xE2), static_cast(0x88), + static_cast(0x91)}; // ∑ sigma symbol + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + char_conv.push(static_cast(ch[0])); + char_conv.push(static_cast(ch[1])); + char_conv.push(static_cast(ch[2])); + auto wch = char_conv.pop_utf32(); + + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast(wch.value()), 8721); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) { + LIBC_NAMESPACE::internal::mbstate state; + 
state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[4] = {static_cast(0xF0), static_cast(0x9F), + static_cast(0xA4), + static_cast(0xA1)}; // 🤡 clown emoji + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + char_conv.push(static_cast(ch[0])); + char_conv.push(static_cast(ch[1])); + char_conv.push(static_cast(ch[2])); + char_conv.push(static_cast(ch[3])); + auto wch = char_conv.pop_utf32(); + + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast(wch.value()), 129313); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch = static_cast(0x80); // invalid starting bit sequence + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast(ch)); + + ASSERT_EQ(err, -1); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[4] = { + static_cast(0x80), static_cast(0x00), static_cast(0x80), + static_cast(0x00)}; // first and third bytes are invalid + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast(ch[0])); + ASSERT_EQ(err, -1); + err = char_conv.push(static_cast(ch[1])); + ASSERT_EQ(err, 0); + // Prev byte was single byte so trying to push another should error. + err = char_conv.push(static_cast(ch[2])); + ASSERT_EQ(err, -1); + err = char_conv.push(static_cast(ch[3])); + ASSERT_EQ(err, 0); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + // Last byte is invalid since it does not have correct starting sequence. 
+ // 0xC0 --> 11000000 starting sequence should be 10xxxxxx + const char ch[4] = {static_cast(0xF1), static_cast(0x80), + static_cast(0x80), static_cast(0xC0)}; + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast(ch[0])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast(ch[1])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast(ch[2])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast(ch[3])); + ASSERT_EQ(err, -1); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[3] = {static_cast(0xC2), static_cast(0x8E), + static_cast(0x80)}; + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast(ch[0])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast(ch[1])); + ASSERT_EQ(err, 0); + // Should produce an error on 3rd byte + err = char_conv.push(static_cast(ch[2])); + ASSERT_EQ(err, -1); + + // Should produce an error since mbstate was reset + auto wch = char_conv.pop_utf32(); + ASSERT_FALSE(wch.has_value()); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[4] = {static_cast(0xC2), static_cast(0x8E), + static_cast(0xC7), static_cast(0x8C)}; + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast(ch[0])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast(ch[1])); + ASSERT_EQ(err, 0); + auto wch = char_conv.pop_utf32(); + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast(wch.value()), 142); + + // Second two byte character + err = char_conv.push(static_cast(ch[2])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast(ch[3])); + ASSERT_EQ(err, 0); + wch = char_conv.pop_utf32(); + ASSERT_TRUE(wch.has_value()); + 
ASSERT_EQ(static_cast(wch.value()), 460); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + const char ch[2] = {static_cast(0xC2), static_cast(0x8E)}; + int err = char_conv.push(static_cast(ch[0])); + ASSERT_EQ(err, 0); + auto wch = char_conv.pop_utf32(); + ASSERT_FALSE( + wch.has_value()); // Should fail since we have not read enough bytes + err = char_conv.push(static_cast(ch[1])); + ASSERT_EQ(err, 0); + wch = char_conv.pop_utf32(); + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast(wch.value()), 142); +} diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index ce2171f19597b..4aa8b95880018 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -20,6 +20,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -68,6 +69,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fwrite libc.src.stdio.setvbuf + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -88,6 +90,7 @@ add_libc_test( libc.src.stdio.fread_unlocked libc.src.stdio.funlockfile libc.src.stdio.fwrite_unlocked + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -109,6 +112,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite + libc.test.UnitTest.ErrnoCheckingTest LINK_LIBRARIES LibcMemoryHelpers ) @@ -438,6 +442,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.sys.stat.mkdirat libc.src.unistd.access libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -452,6 +457,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.rename libc.src.unistd.access libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -468,6 +474,7 @@ if(${LIBC_TARGET_OS} 
STREQUAL "linux") libc.src.stdio.fgets libc.src.stdio.fputs libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) endif() @@ -488,6 +495,8 @@ add_libc_test( libc.src.stdio.fopen libc.src.stdio.fwrite libc.src.stdio.getc + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher ) add_libc_test( @@ -510,6 +519,8 @@ add_libc_test( libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.getc_unlocked + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher ) add_libc_test( @@ -527,6 +538,8 @@ add_libc_test( libc.src.stdio.fgets libc.src.stdio.fopen libc.src.stdio.fwrite + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher ) add_libc_test( diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp index 104fc478b100e..b53184c30be36 100644 --- a/libc/test/src/stdio/fdopen_test.cpp +++ b/libc/test/src/stdio/fdopen_test.cpp @@ -9,20 +9,21 @@ #include "src/stdio/fdopen.h" #include "hdr/fcntl_macros.h" -#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/fclose.h" #include "src/stdio/fgets.h" #include "src/stdio/fputs.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include // For S_IRWXU -TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { +using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); @@ -52,8 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { ASSERT_ERRNO_SUCCESS(); } -TEST(LlvmLibcStdioFdopenTest, InvalidFd) { 
- libc_errno = 0; +TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC); @@ -64,8 +64,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) { ASSERT_TRUE(nullptr == fp); } -TEST(LlvmLibcStdioFdopenTest, InvalidMode) { - libc_errno = 0; +TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) { constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU); @@ -83,7 +82,6 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) { auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w"); ASSERT_ERRNO_EQ(EINVAL); ASSERT_TRUE(nullptr == fp2); - libc_errno = 0; LIBC_NAMESPACE::close(fd); ASSERT_ERRNO_SUCCESS(); } diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 56bde5f0099a8..be2e50271b510 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -14,12 +14,15 @@ #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { +using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; + +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -27,29 +30,28 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { ASSERT_FALSE(file == nullptr); constexpr char CONTENT[] = "123456789"; constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1; - ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file)); + 
ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file), + Succeeds(WRITE_SIZE)); // This is a write-only file so reads should fail. - ASSERT_EQ(func(file), EOF); + ASSERT_THAT(func(file), Fails(EBADF, EOF)); // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; - ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); + ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); file = LIBC_NAMESPACE::fopen(filename, "r"); ASSERT_FALSE(file == nullptr); for (size_t i = 0; i < WRITE_SIZE; ++i) { - int c = func(file); - ASSERT_EQ(c, int('1' + i)); + ASSERT_THAT(func(file), Succeeds(int('1' + i))); } // Reading more should return EOF but not set error. - ASSERT_EQ(func(file), EOF); + ASSERT_THAT(func(file), Succeeds(EOF)); ASSERT_NE(LIBC_NAMESPACE::feof(file), 0); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); - ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); + ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); } }; diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index 90429ecf4e82b..bef9dafd3d87c 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -17,12 +17,15 @@ #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc_unlocked.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { +using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; + +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -30,31 +33,30 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { ASSERT_FALSE(file == nullptr); constexpr 
char CONTENT[] = "123456789"; constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1; - ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file)); + ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file), + Succeeds(WRITE_SIZE)); // This is a write-only file so reads should fail. - ASSERT_EQ(func(file), EOF); + ASSERT_THAT(func(file), Fails(EBADF, EOF)); // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; - ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); + ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); file = LIBC_NAMESPACE::fopen(filename, "r"); ASSERT_FALSE(file == nullptr); LIBC_NAMESPACE::flockfile(file); for (size_t i = 0; i < WRITE_SIZE; ++i) { - int c = func(file); - ASSERT_EQ(c, int('1' + i)); + ASSERT_THAT(func(file), Succeeds(int('1' + i))); } // Reading more should return EOF but not set error. - ASSERT_EQ(func(file), EOF); + ASSERT_THAT(func(file), Succeeds(EOF)); ASSERT_NE(LIBC_NAMESPACE::feof_unlocked(file), 0); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(file), 0); LIBC_NAMESPACE::funlockfile(file); - ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); + ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); } }; diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index abed3d4052939..8fc38b0659181 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -12,11 +12,14 @@ #include "src/stdio/fgets.h" #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include "src/__support/libc_errno.h" +using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; +using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; -TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { +TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { constexpr char FILENAME[] 
= "testdata/fgets.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -29,15 +32,16 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { char buff[8]; char *output; - ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file)); + ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file), + Succeeds(WRITE_SIZE)); // This is a write-only file so reads should fail. - ASSERT_TRUE(LIBC_NAMESPACE::fgets(buff, 8, file) == nullptr); + ASSERT_THAT(LIBC_NAMESPACE::fgets(buff, 8, file), + Fails(EBADF, static_cast(nullptr))); // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; - ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); + ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); file = LIBC_NAMESPACE::fopen(FILENAME, "r"); ASSERT_FALSE(file == nullptr); @@ -55,6 +59,7 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is also implementation defined. 
output = LIBC_NAMESPACE::fgets(buff, 0, file); ASSERT_TRUE(output == nullptr); + ASSERT_ERRNO_SUCCESS(); #endif const char *output_arr[] = { @@ -86,5 +91,5 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { ASSERT_NE(LIBC_NAMESPACE::feof(file), 0); ASSERT_ERRNO_SUCCESS(); - ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); + ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); } diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index e624181c795b8..e097785832d56 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -17,17 +17,18 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" +using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns; -TEST(LlvmLibcFILETest, SimpleFileOperations) { +TEST_F(LlvmLibcFILETest, SimpleFileOperations) { constexpr char FILENAME[] = "testdata/simple_operations.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -41,7 +42,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); @@ -72,7 +72,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; LIBC_NAMESPACE::clearerr(file); @@ -80,15 +79,12 @@ 
TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file), returns(EQ(EOF)).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file), returns(EQ(size_t(0))).with_errno(NE(0))); - libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); @@ -103,10 +99,8 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); // This is not a readable file. - libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), returns(EQ(0)).with_errno(NE(0))); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -121,21 +115,18 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { // Check that the other functions correctly set libc_errno. - // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0); // ASSERT_ERRNO_FAILURE(); - // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0); // ASSERT_ERRNO_FAILURE(); - // libc_errno = 0; // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"), // static_cast(nullptr)); // ASSERT_ERRNO_FAILURE(); } -TEST(LlvmLibcFILETest, FFlush) { +TEST_F(LlvmLibcFILETest, FFlush) { constexpr char FILENAME[] = "testdata/fflush.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+"); ASSERT_FALSE(file == nullptr); @@ -156,7 +147,7 @@ TEST(LlvmLibcFILETest, FFlush) { ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); } -TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { +TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { using MyStruct = struct { char c; unsigned long long i; @@ -165,7 +156,6 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct); constexpr char FILENAME[] = "testdata/fread_fwrite.test"; - libc_errno = 0; FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); 
ASSERT_FALSE(file == nullptr); ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file)); diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp index 03e1ac286b646..bcf5e674141a7 100644 --- a/libc/test/src/stdio/fopencookie_test.cpp +++ b/libc/test/src/stdio/fopencookie_test.cpp @@ -15,6 +15,7 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/MemoryMatcher.h" #include "test/UnitTest/Test.h" @@ -22,6 +23,7 @@ #include "hdr/types/size_t.h" #include "src/__support/libc_errno.h" +using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using MemoryView = LIBC_NAMESPACE::testing::MemoryView; struct StringStream { @@ -88,7 +90,7 @@ int close_ss(void *cookie) { constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss, &seek_ss, &close_ss}; -TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { constexpr char CONTENT[] = "Hello,readonly!"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(CONTENT))); @@ -115,7 +117,6 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -124,7 +125,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { size_t INIT_BUFSIZE = 32; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(INIT_BUFSIZE)); @@ -149,7 +150,6 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); 
ASSERT_ERRNO_EQ(EBADF); - libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -158,7 +158,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { constexpr char INITIAL_CONTENT[] = "1234567890987654321"; constexpr char WRITE_DATA[] = "append"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); @@ -178,7 +178,6 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -192,7 +191,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) { const char INITIAL_CONTENT[] = "1234567890987654321"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(INITIAL_CONTENT))); @@ -223,7 +222,7 @@ TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) { constexpr char WRITE_DATA[] = "hello, file"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(WRITE_DATA))); diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp index 84984e26398c0..296bff1f5dc15 100644 --- a/libc/test/src/stdio/remove_test.cpp +++ b/libc/test/src/stdio/remove_test.cpp @@ -11,16 +11,17 @@ #include "src/sys/stat/mkdirat.h" #include "src/unistd/access.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include "src/__support/libc_errno.h" #include -TEST(LlvmLibcRemoveTest, 
CreateAndRemoveFile) { +using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) { // The test strategy is to create a file and remove it, and also verify that // it was removed. - libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -36,10 +37,9 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT)); } -TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) { +TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) { // The test strategy is to create a dir and remove it, and also verify that // it was removed. - libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *FILENAME = "remove.test.dir"; diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp index ac494a4ecaf8e..135fb98c07fbb 100644 --- a/libc/test/src/stdio/rename_test.cpp +++ b/libc/test/src/stdio/rename_test.cpp @@ -8,18 +8,19 @@ #include "include/llvm-libc-macros/linux/sys-stat-macros.h" #include "include/llvm-libc-macros/linux/unistd-macros.h" -#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/rename.h" #include "src/unistd/access.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -TEST(LlvmLibcRenameTest, CreateAndRenameFile) { +using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { // The test strategy is to create a file and rename it, and also verify that // it was renamed. 
- libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -40,7 +41,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT)); } -TEST(LlvmLibcRenameTest, RenameNonExistent) { +TEST_F(LlvmLibcRenameTest, RenameNonExistent) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; constexpr const char *FILENAME1 = "rename.test.file1"; diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index 5872943c1bb41..a0936ba79ef73 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -11,12 +11,14 @@ #include "src/stdio/fread.h" #include "src/stdio/fwrite.h" #include "src/stdio/setvbuf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" -TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { +using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a NBF buffer to the write handle. Since it is NBF, the data // written using the write handle should be immediately readable by the read @@ -52,7 +54,7 @@ TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr)); } -TEST(LlvmLibcSetvbufTest, SetLBFBuffer) { +TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a LBF buffer to the write handle. 
Since it is LBF, the data // written using the write handle should be available right after a '\n' is @@ -102,6 +104,5 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) { 0); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f)); } diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp index 5d482b70064bd..e99b382d12112 100644 --- a/libc/test/src/stdio/unlocked_fileop_test.cpp +++ b/libc/test/src/stdio/unlocked_fileop_test.cpp @@ -15,11 +15,12 @@ #include "src/stdio/fread_unlocked.h" #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite_unlocked.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -#include "src/__support/libc_errno.h" +using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; -TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { +TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { constexpr char fNAME[] = "testdata/unlocked_read_and_write.test"; ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w"); ASSERT_FALSE(f == nullptr); @@ -36,7 +37,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); @@ -57,7 +57,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 3eeccc5727e77..03f0a6539c785 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -9,7 +9,6 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" #include 
"src/__support/ctype_utils.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index c2f2b9c9a11c3..eb4056dc7ba64 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/libc_errno.h" #include "src/__support/uint128.h" #include "src/stdlib/strtold.h" diff --git a/libcxx/include/__config b/libcxx/include/__config index 38c47e8d45c81..af8a297fdf3fd 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -505,13 +505,6 @@ typedef __char32_t char32_t; # define _LIBCPP_HIDE_FROM_ABI_AFTER_V1 _LIBCPP_HIDE_FROM_ABI # endif -// TODO: Remove this workaround once we drop support for Clang 16 -# if __has_warning("-Wc++23-extensions") -# define _LIBCPP_CLANG_DIAGNOSTIC_IGNORED_CXX23_EXTENSION _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++23-extensions") -# else -# define _LIBCPP_CLANG_DIAGNOSTIC_IGNORED_CXX23_EXTENSION _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++2b-extensions") -# endif - // Clang modules take a significant compile time hit when pushing and popping diagnostics. // Since all the headers are marked as system headers unless _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER is defined, we can // simply disable this pushing and popping when _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER isn't defined. 
@@ -522,7 +515,7 @@ typedef __char32_t char32_t; _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++14-extensions") \ _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++17-extensions") \ _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++20-extensions") \ - _LIBCPP_CLANG_DIAGNOSTIC_IGNORED_CXX23_EXTENSION \ + _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++23-extensions") \ _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wc++14-extensions") \ _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wc++17-extensions") \ _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wc++20-extensions") \ diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h index ab390aafa0d9d..dbacbce044766 100644 --- a/libcxx/include/__utility/pair.h +++ b/libcxx/include/__utility/pair.h @@ -209,21 +209,12 @@ struct pair # endif # if _LIBCPP_STD_VER >= 23 - // TODO: Remove this workaround in LLVM 20. The bug got fixed in Clang 18. - // This is a workaround for http://llvm.org/PR60710. We should be able to remove it once Clang is fixed. - template - _LIBCPP_HIDE_FROM_ABI static constexpr bool __pair_like_explicit_wknd() { - if constexpr (__pair_like_no_subrange<_PairLike>) { - return !is_convertible_v(std::declval<_PairLike&&>())), first_type> || - !is_convertible_v(std::declval<_PairLike&&>())), second_type>; - } - return false; - } - template <__pair_like_no_subrange _PairLike> requires(is_constructible_v(std::declval<_PairLike &&>()))> && is_constructible_v(std::declval<_PairLike &&>()))>) - _LIBCPP_HIDE_FROM_ABI constexpr explicit(__pair_like_explicit_wknd<_PairLike>()) pair(_PairLike&& __p) + _LIBCPP_HIDE_FROM_ABI constexpr explicit( + !is_convertible_v(std::declval<_PairLike&&>())), first_type> || + !is_convertible_v(std::declval<_PairLike&&>())), second_type>) pair(_PairLike&& __p) : first(std::get<0>(std::forward<_PairLike>(__p))), second(std::get<1>(std::forward<_PairLike>(__p))) {} # endif diff --git a/libcxx/include/fstream b/libcxx/include/fstream index 71c4957b691a6..00aa00ff7e9cd 100644 --- a/libcxx/include/fstream +++ b/libcxx/include/fstream @@ -696,7 
+696,7 @@ basic_filebuf<_CharT, _Traits>* basic_filebuf<_CharT, _Traits>::open(const char* if (!__mdstr) return nullptr; - return __do_open(fopen(__s, __mdstr), __mode); + return __do_open(std::fopen(__s, __mdstr), __mode); } template @@ -761,7 +761,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits> std::memmove(this->eback(), this->egptr() - __unget_sz, __unget_sz * sizeof(char_type)); if (__always_noconv_) { size_t __nmemb = static_cast(this->egptr() - this->eback() - __unget_sz); - __nmemb = ::fread(this->eback() + __unget_sz, 1, __nmemb, __file_); + __nmemb = std::fread(this->eback() + __unget_sz, 1, __nmemb, __file_); if (__nmemb != 0) { this->setg(this->eback(), this->eback() + __unget_sz, this->eback() + __unget_sz + __nmemb); __c = traits_type::to_int_type(*this->gptr()); @@ -778,7 +778,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits> std::min(static_cast(__ibs_ - __unget_sz), static_cast(__extbufend_ - __extbufnext_)); codecvt_base::result __r; __st_last_ = __st_; - size_t __nr = fread((void*)const_cast(__extbufnext_), 1, __nmemb, __file_); + size_t __nr = std::fread((void*)const_cast(__extbufnext_), 1, __nmemb, __file_); if (__nr != 0) { if (!__cv_) std::__throw_bad_cast(); @@ -855,7 +855,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits> return traits_type::eof(); } else if (__r == codecvt_base::ok || __r == codecvt_base::partial) { size_t __nmemb = static_cast(__extbe - __extbuf_); - if (fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb) + if (std::fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb) return traits_type::eof(); if (__r == codecvt_base::partial) { this->setp(const_cast(__e), this->pptr()); @@ -990,12 +990,12 @@ int basic_filebuf<_CharT, _Traits>::sync() { char* __extbe; __r = __cv_->unshift(__st_, __extbuf_, __extbuf_ + __ebs_, __extbe); size_t __nmemb = static_cast(__extbe - __extbuf_); - if (fwrite(__extbuf_, 1, __nmemb, __file_) != 
__nmemb) + if (std::fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb) return -1; } while (__r == codecvt_base::partial); if (__r == codecvt_base::error) return -1; - if (fflush(__file_)) + if (std::fflush(__file_)) return -1; } else if (__cm_ & ios_base::in) { off_type __c; diff --git a/libcxx/include/streambuf b/libcxx/include/streambuf index e25647909378e..585ae7af65aa8 100644 --- a/libcxx/include/streambuf +++ b/libcxx/include/streambuf @@ -178,8 +178,8 @@ public: // Get and put areas: // 27.6.2.2.3 Get area: inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 streamsize in_avail() { - if (__ninp_ < __einp_) - return static_cast(__einp_ - __ninp_); + if (gptr() < egptr()) + return static_cast(egptr() - gptr()); return showmanyc(); } @@ -190,37 +190,42 @@ public: } inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sbumpc() { - if (__ninp_ == __einp_) + if (gptr() == egptr()) return uflow(); - return traits_type::to_int_type(*__ninp_++); + int_type __c = traits_type::to_int_type(*gptr()); + this->gbump(1); + return __c; } inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sgetc() { - if (__ninp_ == __einp_) + if (gptr() == egptr()) return underflow(); - return traits_type::to_int_type(*__ninp_); + return traits_type::to_int_type(*gptr()); } inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 streamsize sgetn(char_type* __s, streamsize __n) { return xsgetn(__s, __n); } // 27.6.2.2.4 Putback: inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sputbackc(char_type __c) { - if (__binp_ == __ninp_ || !traits_type::eq(__c, __ninp_[-1])) + if (eback() == gptr() || !traits_type::eq(__c, *(gptr() - 1))) return pbackfail(traits_type::to_int_type(__c)); - return traits_type::to_int_type(*--__ninp_); + this->gbump(-1); + return traits_type::to_int_type(*gptr()); } inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sungetc() { - if (__binp_ == __ninp_) + if (eback() == gptr()) return pbackfail(); - return traits_type::to_int_type(*--__ninp_); + this->gbump(-1); + return traits_type::to_int_type(*gptr()); } // 
27.6.2.2.5 Put area: inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sputc(char_type __c) { - if (__nout_ == __eout_) + if (pptr() == epptr()) return overflow(traits_type::to_int_type(__c)); - *__nout_++ = __c; + *pptr() = __c; + this->pbump(1); return traits_type::to_int_type(__c); } @@ -312,17 +317,16 @@ protected: virtual streamsize showmanyc() { return 0; } virtual streamsize xsgetn(char_type* __s, streamsize __n) { - const int_type __eof = traits_type::eof(); int_type __c; streamsize __i = 0; while (__i < __n) { - if (__ninp_ < __einp_) { - const streamsize __len = std::min(static_cast(INT_MAX), std::min(__einp_ - __ninp_, __n - __i)); - traits_type::copy(__s, __ninp_, __len); + if (gptr() < egptr()) { + const streamsize __len = std::min(static_cast(INT_MAX), std::min(egptr() - gptr(), __n - __i)); + traits_type::copy(__s, gptr(), __len); __s += __len; __i += __len; this->gbump(__len); - } else if ((__c = uflow()) != __eof) { + } else if ((__c = uflow()) != traits_type::eof()) { *__s = traits_type::to_char_type(__c); ++__s; ++__i; @@ -336,7 +340,9 @@ protected: virtual int_type uflow() { if (underflow() == traits_type::eof()) return traits_type::eof(); - return traits_type::to_int_type(*__ninp_++); + int_type __c = traits_type::to_int_type(*gptr()); + this->gbump(1); + return __c; } // 27.6.2.4.4 Putback: @@ -345,17 +351,16 @@ protected: // 27.6.2.4.5 Put area: virtual streamsize xsputn(const char_type* __s, streamsize __n) { streamsize __i = 0; - int_type __eof = traits_type::eof(); while (__i < __n) { - if (__nout_ >= __eout_) { - if (overflow(traits_type::to_int_type(*__s)) == __eof) + if (pptr() >= epptr()) { + if (overflow(traits_type::to_int_type(*__s)) == traits_type::eof()) break; ++__s; ++__i; } else { - streamsize __chunk_size = std::min(__eout_ - __nout_, __n - __i); - traits_type::copy(__nout_, __s, __chunk_size); - __nout_ += __chunk_size; + streamsize __chunk_size = std::min(epptr() - pptr(), __n - __i); + traits_type::copy(pptr(), __s, 
__chunk_size); + __pbump(__chunk_size); __s += __chunk_size; __i += __chunk_size; } diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp index becf89b12fdd1..973d744a1da44 100644 --- a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp +++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: has-unix-headers -// UNSUPPORTED: libcpp-hardening-mode=none +// UNSUPPORTED: c++03, libcpp-hardening-mode=none // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp index abd42272de508..5aaad2738d325 100644 --- a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp +++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: has-unix-headers -// UNSUPPORTED: libcpp-hardening-mode=none +// UNSUPPORTED: c++03, libcpp-hardening-mode=none // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/lld/MachO/DriverUtils.cpp b/lld/MachO/DriverUtils.cpp index f7f6be049f0e1..a3b722f13daca 100644 --- a/lld/MachO/DriverUtils.cpp +++ b/lld/MachO/DriverUtils.cpp @@ -225,14 +225,21 @@ std::optional macho::resolveDylibPath(StringRef dylibPath) { // 
especially if it's a commonly re-exported core library. static DenseMap loadedDylibs; +static StringRef realPathIfDifferent(StringRef path) { + SmallString<128> realPathBuf; + if (fs::real_path(path, realPathBuf)) + return StringRef(); + + SmallString<128> absPathBuf = path; + if (!fs::make_absolute(absPathBuf) && realPathBuf == absPathBuf) + return StringRef(); + + return uniqueSaver().save(StringRef(realPathBuf)); +} + DylibFile *macho::loadDylib(MemoryBufferRef mbref, DylibFile *umbrella, bool isBundleLoader, bool explicitlyLinked) { - // Frameworks can be found from different symlink paths, so resolve - // symlinks before looking up in the dylib cache. - SmallString<128> realPath; - std::error_code err = fs::real_path(mbref.getBufferIdentifier(), realPath); - CachedHashStringRef path(!err ? uniqueSaver().save(StringRef(realPath)) - : mbref.getBufferIdentifier()); + CachedHashStringRef path(mbref.getBufferIdentifier()); DylibFile *&file = loadedDylibs[path]; if (file) { if (explicitlyLinked) @@ -240,6 +247,22 @@ DylibFile *macho::loadDylib(MemoryBufferRef mbref, DylibFile *umbrella, return file; } + // Frameworks can be found from different symlink paths, so resolve + // symlinks and look up in the dylib cache. + CachedHashStringRef realPath( + realPathIfDifferent(mbref.getBufferIdentifier())); + if (!realPath.val().empty()) { + // Avoid map insertions here so that we do not invalidate the "file" + // reference. 
+ auto it = loadedDylibs.find(realPath); + if (it != loadedDylibs.end()) { + DylibFile *realfile = it->second; + if (explicitlyLinked) + realfile->setExplicitlyLinked(); + return realfile; + } + } + DylibFile *newFile; file_magic magic = identify_magic(mbref.getBuffer()); if (magic == file_magic::tapi_file) { @@ -292,6 +315,11 @@ DylibFile *macho::loadDylib(MemoryBufferRef mbref, DylibFile *umbrella, sys::path::filename(newFile->installName) + "' because " + config->clientName + " is not an allowed client"); } + + // If the load path was a symlink, cache the real path too. + if (!realPath.val().empty()) + loadedDylibs[realPath] = newFile; + return newFile; } diff --git a/lld/test/ELF/lto/aarch64-pac-got-func.ll b/lld/test/ELF/lto/aarch64-pac-got-func.ll index a37c67a2f3ba8..0baa3559a6f90 100644 --- a/lld/test/ELF/lto/aarch64-pac-got-func.ll +++ b/lld/test/ELF/lto/aarch64-pac-got-func.ll @@ -5,29 +5,29 @@ ; RUN: llvm-readelf -r -x.got %t | FileCheck %s ; CHECK: Relocation section '.rela.dyn' at offset 0x3d0 contains 8 entries: -; CHECK-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend -; CHECK-NEXT: 00000000000206a0 0000000100000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 func_undef + 0 -; CHECK-NEXT: 00000000000206a8 0000000200000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g1 + 0 -; CHECK-NEXT: 00000000000206b0 0000000300000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g2 + 0 -; CHECK-NEXT: 00000000000206b8 0000000400000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g3 + 0 -; CHECK-NEXT: 00000000000206c0 0000000500000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g4 + 0 -; CHECK-NEXT: 00000000000206c8 0000000600000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 var_undef + 0 -; CHECK-NEXT: 0000000000020690 0000000700000412 R_AARCH64_AUTH_GLOB_DAT 0000000000010490 func + 0 -; CHECK-NEXT: 0000000000020698 0000000a00000412 R_AARCH64_AUTH_GLOB_DAT 00000000000306d0 var + 0 +; CHECK-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend +; CHECK-NEXT: 
[[#%x,ADDR:]] 0000000100000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 func_undef + 0 +; CHECK-NEXT: {{0*}}[[#ADDR+0x8]] 0000000200000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g1 + 0 +; CHECK-NEXT: {{0*}}[[#ADDR+0x10]] 0000000300000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g2 + 0 +; CHECK-NEXT: {{0*}}[[#ADDR+0x18]] 0000000400000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g3 + 0 +; CHECK-NEXT: {{0*}}[[#ADDR+0x20]] 0000000500000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g4 + 0 +; CHECK-NEXT: {{0*}}[[#ADDR+0x28]] 0000000600000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 var_undef + 0 +; CHECK-NEXT: {{0*}}[[#ADDR-0x10]] 0000000700000412 R_AARCH64_AUTH_GLOB_DAT 0000000000010800 func + 0 +; CHECK-NEXT: {{0*}}[[#ADDR-0x8]] 0000000a00000412 R_AARCH64_AUTH_GLOB_DAT 0000000000031400 var + 0 ; CHECK: Hex dump of section '.got': -; CHECK-NEXT: 0x00020690 00000000 00000080 00000000 000000a0 -;; ^^ func: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA -;; ^^ var: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA -; CHECK-NEXT: 0x000206a0 00000000 00000080 00000000 000000a0 -;; ^^ func_undef: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA -;; ^^ g1: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA -; CHECK-NEXT: 0x000206b0 00000000 000000a0 00000000 000000a0 -;; ^^ g2: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA -;; ^^ g3: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA -; CHECK-NEXT: 0x000206c0 00000000 000000a0 00000000 000000a0 -;; ^^ g4: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA -;; ^^ var_undef: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA +; CHECK-NEXT: 0x{{0*}}[[#ADDR-0x10]] 00000000 00000080 00000000 000000a0 +;; ^^ func: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA +;; ^^ var: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA +; CHECK-NEXT: 0x{{0*}}[[#ADDR]] 00000000 00000080 
00000000 000000a0 +;; ^^ func_undef: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA +;; ^^ g1: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA +; CHECK-NEXT: 0x{{0*}}[[#ADDR+0x10]] 00000000 000000a0 00000000 000000a0 +;; ^^ g2: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA +;; ^^ g3: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA +; CHECK-NEXT: 0x{{0*}}[[#ADDR+0x20]] 00000000 000000a0 00000000 000000a0 +;; ^^ g4: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA +;; ^^ var_undef: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" @@ -37,16 +37,16 @@ target triple = "aarch64-unknown-linux-gnu" @g3 = external global ptr @g4 = external global ptr -define void @func() { +define void @func() align 1024 { entry: ret void } declare void @func_undef() -@var = global i32 42 +@var = global i32 42, align 1024 @var_undef = external global i32 -define void @bar() #0 { +define void @bar() #0 align 1024 { entry: store ptr ptrauth (ptr @func, i32 0), ptr @g1 store ptr ptrauth (ptr @func_undef, i32 0), ptr @g2 @@ -55,7 +55,7 @@ entry: ret void } -define void @_start() { +define void @_start() align 1024 { entry: ret void } diff --git a/lld/test/MachO/reexport-with-symlink.s b/lld/test/MachO/reexport-with-symlink.s new file mode 100644 index 0000000000000..a6b5992713f39 --- /dev/null +++ b/lld/test/MachO/reexport-with-symlink.s @@ -0,0 +1,74 @@ +# REQUIRES: aarch64, shell +# RUN: rm -rf %t; split-file %s %t +# RUN: ln -s Versions/A/Developer %t/Developer/Library/Frameworks/Developer.framework/ +# RUN: llvm-mc -filetype obj -triple arm64-apple-macos11.0 %t/test.s -o %t/test.o +# RUN: %lld -arch arm64 -platform_version macos 11.0 11.0 -o %t/test -framework Developer -F %t/Developer/Library/Frameworks -L %t/Developer/usr/lib %t/test.o -t | FileCheck %s + +# CHECK: 
{{.*}}/Developer/Library/Frameworks/Developer.framework/Developer +# CHECK: {{.*}}/Developer/usr/lib/libDeveloperSupport.tbd(@rpath/libDeveloperSupport.dylib) +# CHECK-NOT: {{.*}}/Developer/Library/Frameworks/Developer.framework/Versions/A/Developer + +#--- Developer/Library/Frameworks/Developer.framework/Versions/A/Developer +{ + "tapi_tbd_version": 5, + "main_library": { + "target_info": [ + { + "target": "arm64-macos" + } + ], + "install_names": [ + { + "name": "@rpath/Developer.framework/Developer" + } + ], + "exported_symbols": [ + { + "text": { + "global": ["_funcPublic"] + } + } + ] + } +} +#--- Developer/usr/lib/libDeveloperSupport.tbd +{ + "tapi_tbd_version": 5, + "main_library": { + "target_info": [ + { + "target": "arm64-macos" + } + ], + "install_names": [ + { + "name": "@rpath/libDeveloperSupport.dylib" + } + ], + "reexported_libraries": [ + { + "names": [ + "@rpath/Developer.framework/Versions/A/Developer" + ] + } + ], + "exported_symbols": [ + { + "text": { + "global": ["_funcSupport"] + } + } + ] + } +} +#--- test.s +.text +.globl _main +.linker_option "-lDeveloperSupport" + +_main: + ret + +.data + .quad _funcPublic + .quad _funcSupport diff --git a/lldb/cmake/modules/FindPythonAndSwig.cmake b/lldb/cmake/modules/FindPythonAndSwig.cmake index 1f6f553e86048..b478038f144d9 100644 --- a/lldb/cmake/modules/FindPythonAndSwig.cmake +++ b/lldb/cmake/modules/FindPythonAndSwig.cmake @@ -6,7 +6,9 @@ macro(FindPython3) # Use PYTHON_HOME as a hint to find Python 3. 
- set(Python3_ROOT_DIR "${PYTHON_HOME}") + if(NOT Python3_ROOT_DIR) + set(Python3_ROOT_DIR "${PYTHON_HOME}") + endif() find_package(Python3 COMPONENTS Interpreter Development) if(Python3_FOUND AND Python3_Interpreter_FOUND) diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h index e7b1691031111..1d7c976f3c382 100644 --- a/lldb/include/lldb/Core/PluginManager.h +++ b/lldb/include/lldb/Core/PluginManager.h @@ -236,12 +236,6 @@ class PluginManager { static SystemRuntimeCreateInstance GetSystemRuntimeCreateCallbackAtIndex(uint32_t idx); - static std::vector GetSystemRuntimePluginInfo(); - - // Modify the enabled state of a SystemRuntime plugin. - // Returns false if the plugin name is not found. - static bool SetSystemRuntimePluginEnabled(llvm::StringRef name, bool enabled); - // ObjectFile static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description, @@ -549,12 +543,6 @@ class PluginManager { static InstrumentationRuntimeCreateInstance GetInstrumentationRuntimeCreateCallbackAtIndex(uint32_t idx); - static std::vector - GetInstrumentationRuntimePluginInfo(); - - static bool SetInstrumentationRuntimePluginEnabled(llvm::StringRef name, - bool enabled); - // TypeSystem static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description, TypeSystemCreateInstance create_callback, @@ -690,6 +678,102 @@ class PluginManager { static bool CreateSettingForCPlusPlusLanguagePlugin( Debugger &debugger, const lldb::OptionValuePropertiesSP &properties_sp, llvm::StringRef description, bool is_global_property); + + // + // Plugin Info+Enable Declarations + // + static std::vector GetABIPluginInfo(); + static bool SetABIPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetArchitecturePluginInfo(); + static bool SetArchitecturePluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetDisassemblerPluginInfo(); + static bool SetDisassemblerPluginEnabled(llvm::StringRef name, bool 
enable); + + static std::vector GetDynamicLoaderPluginInfo(); + static bool SetDynamicLoaderPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetEmulateInstructionPluginInfo(); + static bool SetEmulateInstructionPluginEnabled(llvm::StringRef name, + bool enable); + + static std::vector + GetInstrumentationRuntimePluginInfo(); + static bool SetInstrumentationRuntimePluginEnabled(llvm::StringRef name, + bool enable); + + static std::vector GetJITLoaderPluginInfo(); + static bool SetJITLoaderPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetLanguagePluginInfo(); + static bool SetLanguagePluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetLanguageRuntimePluginInfo(); + static bool SetLanguageRuntimePluginEnabled(llvm::StringRef name, + bool enable); + + static std::vector GetMemoryHistoryPluginInfo(); + static bool SetMemoryHistoryPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetObjectContainerPluginInfo(); + static bool SetObjectContainerPluginEnabled(llvm::StringRef name, + bool enable); + + static std::vector GetObjectFilePluginInfo(); + static bool SetObjectFilePluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetOperatingSystemPluginInfo(); + static bool SetOperatingSystemPluginEnabled(llvm::StringRef name, + bool enable); + + static std::vector GetPlatformPluginInfo(); + static bool SetPlatformPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetProcessPluginInfo(); + static bool SetProcessPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetREPLPluginInfo(); + static bool SetREPLPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetRegisterTypeBuilderPluginInfo(); + static bool SetRegisterTypeBuilderPluginEnabled(llvm::StringRef name, + bool enable); + + static std::vector GetScriptInterpreterPluginInfo(); + static bool SetScriptInterpreterPluginEnabled(llvm::StringRef name, 
+ bool enable); + + static std::vector GetScriptedInterfacePluginInfo(); + static bool SetScriptedInterfacePluginEnabled(llvm::StringRef name, + bool enable); + + static std::vector GetStructuredDataPluginInfo(); + static bool SetStructuredDataPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetSymbolFilePluginInfo(); + static bool SetSymbolFilePluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetSymbolLocatorPluginInfo(); + static bool SetSymbolLocatorPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetSymbolVendorPluginInfo(); + static bool SetSymbolVendorPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetSystemRuntimePluginInfo(); + static bool SetSystemRuntimePluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetTracePluginInfo(); + static bool SetTracePluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetTraceExporterPluginInfo(); + static bool SetTraceExporterPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetTypeSystemPluginInfo(); + static bool SetTypeSystemPluginEnabled(llvm::StringRef name, bool enable); + + static std::vector GetUnwindAssemblyPluginInfo(); + static bool SetUnwindAssemblyPluginEnabled(llvm::StringRef name, bool enable); }; } // namespace lldb_private diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 9786678aa53f9..6d32491eaa5e9 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -494,7 +494,7 @@ def wait_for_terminated(self, timeout: Optional[float] = None): raise ValueError("didn't get terminated event") return event_dict - def get_capability(self, key): + def get_capability(self, key: str): """Get a value for the given key if it there is a key/value pair in the capabilities 
reported by the adapter. """ @@ -1050,8 +1050,12 @@ def request_setBreakpoints(self, source: Source, line_array, data=None): self._update_verified_breakpoints(response["body"]["breakpoints"]) return response - def request_setExceptionBreakpoints(self, filters): + def request_setExceptionBreakpoints( + self, *, filters: list[str] = [], filter_options: list[dict] = [] + ): args_dict = {"filters": filters} + if filter_options: + args_dict["filterOptions"] = filter_options command_dict = { "command": "setExceptionBreakpoints", "type": "request", diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp index a2aaa2065dfd7..d2ed92f8b0c97 100644 --- a/lldb/source/Core/DynamicLoader.cpp +++ b/lldb/source/Core/DynamicLoader.cpp @@ -239,6 +239,8 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( ModuleSpec module_spec; module_spec.GetUUID() = uuid; FileSpec name_filespec(name); + if (FileSystem::Instance().Exists(name_filespec)) + module_spec.GetFileSpec() = name_filespec; if (uuid.IsValid()) { Progress progress("Locating binary", prog_str.GetString().str()); diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp index 5d44434033c55..dfa865929b64f 100644 --- a/lldb/source/Core/PluginManager.cpp +++ b/lldb/source/Core/PluginManager.cpp @@ -182,14 +182,176 @@ void PluginManager::Terminate() { } llvm::ArrayRef PluginManager::GetPluginNamespaces() { - // Currently supported set of plugin namespaces. This will be expanded - // over time. 
static PluginNamespace PluginNamespaces[] = { - {"system-runtime", PluginManager::GetSystemRuntimePluginInfo, - PluginManager::SetSystemRuntimePluginEnabled}, - {"instrumentation-runtime", - PluginManager::GetInstrumentationRuntimePluginInfo, - PluginManager::SetInstrumentationRuntimePluginEnabled}}; + + { + "abi", + PluginManager::GetABIPluginInfo, + PluginManager::SetABIPluginEnabled, + }, + + { + "architecture", + PluginManager::GetArchitecturePluginInfo, + PluginManager::SetArchitecturePluginEnabled, + }, + + { + "disassembler", + PluginManager::GetDisassemblerPluginInfo, + PluginManager::SetDisassemblerPluginEnabled, + }, + + { + "dynamic-loader", + PluginManager::GetDynamicLoaderPluginInfo, + PluginManager::SetDynamicLoaderPluginEnabled, + }, + + { + "emulate-instruction", + PluginManager::GetEmulateInstructionPluginInfo, + PluginManager::SetEmulateInstructionPluginEnabled, + }, + + { + "instrumentation-runtime", + PluginManager::GetInstrumentationRuntimePluginInfo, + PluginManager::SetInstrumentationRuntimePluginEnabled, + }, + + { + "jit-loader", + PluginManager::GetJITLoaderPluginInfo, + PluginManager::SetJITLoaderPluginEnabled, + }, + + { + "language", + PluginManager::GetLanguagePluginInfo, + PluginManager::SetLanguagePluginEnabled, + }, + + { + "language-runtime", + PluginManager::GetLanguageRuntimePluginInfo, + PluginManager::SetLanguageRuntimePluginEnabled, + }, + + { + "memory-history", + PluginManager::GetMemoryHistoryPluginInfo, + PluginManager::SetMemoryHistoryPluginEnabled, + }, + + { + "object-container", + PluginManager::GetObjectContainerPluginInfo, + PluginManager::SetObjectContainerPluginEnabled, + }, + + { + "object-file", + PluginManager::GetObjectFilePluginInfo, + PluginManager::SetObjectFilePluginEnabled, + }, + + { + "operating-system", + PluginManager::GetOperatingSystemPluginInfo, + PluginManager::SetOperatingSystemPluginEnabled, + }, + + { + "platform", + PluginManager::GetPlatformPluginInfo, + 
PluginManager::SetPlatformPluginEnabled, + }, + + { + "process", + PluginManager::GetProcessPluginInfo, + PluginManager::SetProcessPluginEnabled, + }, + + { + "repl", + PluginManager::GetREPLPluginInfo, + PluginManager::SetREPLPluginEnabled, + }, + + { + "register-type-builder", + PluginManager::GetRegisterTypeBuilderPluginInfo, + PluginManager::SetRegisterTypeBuilderPluginEnabled, + }, + + { + "script-interpreter", + PluginManager::GetScriptInterpreterPluginInfo, + PluginManager::SetScriptInterpreterPluginEnabled, + }, + + { + "scripted-interface", + PluginManager::GetScriptedInterfacePluginInfo, + PluginManager::SetScriptedInterfacePluginEnabled, + }, + + { + "structured-data", + PluginManager::GetStructuredDataPluginInfo, + PluginManager::SetStructuredDataPluginEnabled, + }, + + { + "symbol-file", + PluginManager::GetSymbolFilePluginInfo, + PluginManager::SetSymbolFilePluginEnabled, + }, + + { + "symbol-locator", + PluginManager::GetSymbolLocatorPluginInfo, + PluginManager::SetSymbolLocatorPluginEnabled, + }, + + { + "symbol-vendor", + PluginManager::GetSymbolVendorPluginInfo, + PluginManager::SetSymbolVendorPluginEnabled, + }, + + { + "system-runtime", + PluginManager::GetSystemRuntimePluginInfo, + PluginManager::SetSystemRuntimePluginEnabled, + }, + + { + "trace", + PluginManager::GetTracePluginInfo, + PluginManager::SetTracePluginEnabled, + }, + + { + "trace-exporter", + PluginManager::GetTraceExporterPluginInfo, + PluginManager::SetTraceExporterPluginEnabled, + }, + + { + "type-system", + PluginManager::GetTypeSystemPluginInfo, + PluginManager::SetTypeSystemPluginEnabled, + }, + + { + "unwind-assembly", + PluginManager::GetUnwindAssemblyPluginInfo, + PluginManager::SetUnwindAssemblyPluginEnabled, + }, + }; return PluginNamespaces; } @@ -407,7 +569,7 @@ ABICreateInstance PluginManager::GetABICreateCallbackAtIndex(uint32_t idx) { #pragma mark Architecture typedef PluginInstance ArchitectureInstance; -typedef std::vector ArchitectureInstances; +typedef 
PluginInstances ArchitectureInstances; static ArchitectureInstances &GetArchitectureInstances() { static ArchitectureInstances g_instances; @@ -417,25 +579,18 @@ static ArchitectureInstances &GetArchitectureInstances() { void PluginManager::RegisterPlugin(llvm::StringRef name, llvm::StringRef description, ArchitectureCreateInstance create_callback) { - GetArchitectureInstances().push_back({name, description, create_callback}); + GetArchitectureInstances().RegisterPlugin(name, description, create_callback); } void PluginManager::UnregisterPlugin( ArchitectureCreateInstance create_callback) { auto &instances = GetArchitectureInstances(); - - for (auto pos = instances.begin(), end = instances.end(); pos != end; ++pos) { - if (pos->create_callback == create_callback) { - instances.erase(pos); - return; - } - } - llvm_unreachable("Plugin not found"); + instances.UnregisterPlugin(create_callback); } std::unique_ptr PluginManager::CreateArchitectureInstance(const ArchSpec &arch) { - for (const auto &instances : GetArchitectureInstances()) { + for (const auto &instances : GetArchitectureInstances().GetSnapshot()) { if (auto plugin_up = instances.create_callback(arch)) return plugin_up; } @@ -718,15 +873,6 @@ PluginManager::GetSystemRuntimeCreateCallbackAtIndex(uint32_t idx) { return GetSystemRuntimeInstances().GetCallbackAtIndex(idx); } -std::vector PluginManager::GetSystemRuntimePluginInfo() { - return GetSystemRuntimeInstances().GetPluginInfoForAllInstances(); -} - -bool PluginManager::SetSystemRuntimePluginEnabled(llvm::StringRef name, - bool enable) { - return GetSystemRuntimeInstances().SetInstanceEnabled(name, enable); -} - #pragma mark ObjectFile struct ObjectFileInstance : public PluginInstance { @@ -1563,16 +1709,6 @@ PluginManager::GetInstrumentationRuntimeCreateCallbackAtIndex(uint32_t idx) { return GetInstrumentationRuntimeInstances().GetCallbackAtIndex(idx); } -std::vector -PluginManager::GetInstrumentationRuntimePluginInfo() { - return 
GetInstrumentationRuntimeInstances().GetPluginInfoForAllInstances(); -} - -bool PluginManager::SetInstrumentationRuntimePluginEnabled(llvm::StringRef name, - bool enable) { - return GetInstrumentationRuntimeInstances().SetInstanceEnabled(name, enable); -} - #pragma mark TypeSystem struct TypeSystemInstance : public PluginInstance { @@ -2057,3 +2193,234 @@ bool PluginManager::CreateSettingForCPlusPlusLanguagePlugin( "Settings for CPlusPlus language plug-ins", properties_sp, description, is_global_property); } + +// +// Plugin Info+Enable Implementations +// +std::vector PluginManager::GetABIPluginInfo() { + return GetABIInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetABIPluginEnabled(llvm::StringRef name, bool enable) { + return GetABIInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetArchitecturePluginInfo() { + return GetArchitectureInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetArchitecturePluginEnabled(llvm::StringRef name, + bool enable) { + return GetArchitectureInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetDisassemblerPluginInfo() { + return GetDisassemblerInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetDisassemblerPluginEnabled(llvm::StringRef name, + bool enable) { + return GetDisassemblerInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetDynamicLoaderPluginInfo() { + return GetDynamicLoaderInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetDynamicLoaderPluginEnabled(llvm::StringRef name, + bool enable) { + return GetDynamicLoaderInstances().SetInstanceEnabled(name, enable); +} + +std::vector +PluginManager::GetEmulateInstructionPluginInfo() { + return GetEmulateInstructionInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetEmulateInstructionPluginEnabled(llvm::StringRef name, + bool enable) { + return 
GetEmulateInstructionInstances().SetInstanceEnabled(name, enable); +} + +std::vector +PluginManager::GetInstrumentationRuntimePluginInfo() { + return GetInstrumentationRuntimeInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetInstrumentationRuntimePluginEnabled(llvm::StringRef name, + bool enable) { + return GetInstrumentationRuntimeInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetJITLoaderPluginInfo() { + return GetJITLoaderInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetJITLoaderPluginEnabled(llvm::StringRef name, + bool enable) { + return GetJITLoaderInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetLanguagePluginInfo() { + return GetLanguageInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetLanguagePluginEnabled(llvm::StringRef name, + bool enable) { + return GetLanguageInstances().SetInstanceEnabled(name, enable); +} + +std::vector +PluginManager::GetLanguageRuntimePluginInfo() { + return GetLanguageRuntimeInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetLanguageRuntimePluginEnabled(llvm::StringRef name, + bool enable) { + return GetLanguageRuntimeInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetMemoryHistoryPluginInfo() { + return GetMemoryHistoryInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetMemoryHistoryPluginEnabled(llvm::StringRef name, + bool enable) { + return GetMemoryHistoryInstances().SetInstanceEnabled(name, enable); +} + +std::vector +PluginManager::GetObjectContainerPluginInfo() { + return GetObjectContainerInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetObjectContainerPluginEnabled(llvm::StringRef name, + bool enable) { + return GetObjectContainerInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetObjectFilePluginInfo() { + return GetObjectFileInstances().GetPluginInfoForAllInstances(); +} 
+bool PluginManager::SetObjectFilePluginEnabled(llvm::StringRef name, + bool enable) { + return GetObjectFileInstances().SetInstanceEnabled(name, enable); +} + +std::vector +PluginManager::GetOperatingSystemPluginInfo() { + return GetOperatingSystemInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetOperatingSystemPluginEnabled(llvm::StringRef name, + bool enable) { + return GetOperatingSystemInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetPlatformPluginInfo() { + return GetPlatformInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetPlatformPluginEnabled(llvm::StringRef name, + bool enable) { + return GetPlatformInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetProcessPluginInfo() { + return GetProcessInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetProcessPluginEnabled(llvm::StringRef name, bool enable) { + return GetProcessInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetREPLPluginInfo() { + return GetREPLInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetREPLPluginEnabled(llvm::StringRef name, bool enable) { + return GetREPLInstances().SetInstanceEnabled(name, enable); +} + +std::vector +PluginManager::GetRegisterTypeBuilderPluginInfo() { + return GetRegisterTypeBuilderInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetRegisterTypeBuilderPluginEnabled(llvm::StringRef name, + bool enable) { + return GetRegisterTypeBuilderInstances().SetInstanceEnabled(name, enable); +} + +std::vector +PluginManager::GetScriptInterpreterPluginInfo() { + return GetScriptInterpreterInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetScriptInterpreterPluginEnabled(llvm::StringRef name, + bool enable) { + return GetScriptInterpreterInstances().SetInstanceEnabled(name, enable); +} + +std::vector +PluginManager::GetScriptedInterfacePluginInfo() { + return 
GetScriptedInterfaceInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetScriptedInterfacePluginEnabled(llvm::StringRef name, + bool enable) { + return GetScriptedInterfaceInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetStructuredDataPluginInfo() { + return GetStructuredDataPluginInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetStructuredDataPluginEnabled(llvm::StringRef name, + bool enable) { + return GetStructuredDataPluginInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetSymbolFilePluginInfo() { + return GetSymbolFileInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetSymbolFilePluginEnabled(llvm::StringRef name, + bool enable) { + return GetSymbolFileInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetSymbolLocatorPluginInfo() { + return GetSymbolLocatorInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetSymbolLocatorPluginEnabled(llvm::StringRef name, + bool enable) { + return GetSymbolLocatorInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetSymbolVendorPluginInfo() { + return GetSymbolVendorInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetSymbolVendorPluginEnabled(llvm::StringRef name, + bool enable) { + return GetSymbolVendorInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetSystemRuntimePluginInfo() { + return GetSystemRuntimeInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetSystemRuntimePluginEnabled(llvm::StringRef name, + bool enable) { + return GetSystemRuntimeInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetTracePluginInfo() { + return GetTracePluginInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetTracePluginEnabled(llvm::StringRef name, bool enable) { + return GetTracePluginInstances().SetInstanceEnabled(name, enable); +} 
+ +std::vector PluginManager::GetTraceExporterPluginInfo() { + return GetTraceExporterInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetTraceExporterPluginEnabled(llvm::StringRef name, + bool enable) { + return GetTraceExporterInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetTypeSystemPluginInfo() { + return GetTypeSystemInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetTypeSystemPluginEnabled(llvm::StringRef name, + bool enable) { + return GetTypeSystemInstances().SetInstanceEnabled(name, enable); +} + +std::vector PluginManager::GetUnwindAssemblyPluginInfo() { + return GetUnwindAssemblyInstances().GetPluginInfoForAllInstances(); +} +bool PluginManager::SetUnwindAssemblyPluginEnabled(llvm::StringRef name, + bool enable) { + return GetUnwindAssemblyInstances().SetInstanceEnabled(name, enable); +} diff --git a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp index c7c292a8a7e42..be17c5421fc51 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp @@ -18,6 +18,7 @@ #include "lldb/Expression/UtilityFunction.h" #include "lldb/Target/ExecutionContext.h" +#include "lldb/Target/Language.h" #include "lldb/Target/Process.h" #include "lldb/Target/StackFrame.h" #include "lldb/Target/Target.h" @@ -32,36 +33,27 @@ using namespace lldb_private; static char ID; -#define VALID_POINTER_CHECK_NAME "_$__lldb_valid_pointer_check" #define VALID_OBJC_OBJECT_CHECK_NAME "$__lldb_objc_object_check" -static const char g_valid_pointer_check_text[] = - "extern \"C\" void\n" - "_$__lldb_valid_pointer_check (unsigned char *$__lldb_arg_ptr)\n" - "{\n" - " unsigned char $__lldb_local_val = *$__lldb_arg_ptr;\n" - "}"; - ClangDynamicCheckerFunctions::ClangDynamicCheckerFunctions() : DynamicCheckerFunctions(DCF_Clang) {} 
ClangDynamicCheckerFunctions::~ClangDynamicCheckerFunctions() = default; -llvm::Error ClangDynamicCheckerFunctions::Install( - DiagnosticManager &diagnostic_manager, ExecutionContext &exe_ctx) { - Expected> utility_fn = - exe_ctx.GetTargetRef().CreateUtilityFunction( - g_valid_pointer_check_text, VALID_POINTER_CHECK_NAME, - lldb::eLanguageTypeC, exe_ctx); - if (!utility_fn) - return utility_fn.takeError(); - m_valid_pointer_check = std::move(*utility_fn); - +llvm::Error +ClangDynamicCheckerFunctions::Install(DiagnosticManager &diagnostic_manager, + ExecutionContext &exe_ctx) { if (Process *process = exe_ctx.GetProcessPtr()) { ObjCLanguageRuntime *objc_language_runtime = ObjCLanguageRuntime::Get(*process); - if (objc_language_runtime) { + SourceLanguage lang = process->GetTarget().GetLanguage(); + if (!lang) + if (auto *frame = exe_ctx.GetFramePtr()) + lang = frame->GetLanguage(); + + if (objc_language_runtime && + Language::LanguageIsObjC(lang.AsLanguageType())) { Expected> checker_fn = objc_language_runtime->CreateObjectChecker(VALID_OBJC_OBJECT_CHECK_NAME, exe_ctx); if (!checker_fn) @@ -78,11 +70,7 @@ bool ClangDynamicCheckerFunctions::DoCheckersExplainStop(lldb::addr_t addr, // FIXME: We have to get the checkers to know why they scotched the call in // more detail, // so we can print a better message here. 
- if (m_valid_pointer_check && m_valid_pointer_check->ContainsAddress(addr)) { - message.Printf("Attempted to dereference an invalid pointer."); - return true; - } else if (m_objc_object_check && - m_objc_object_check->ContainsAddress(addr)) { + if (m_objc_object_check && m_objc_object_check->ContainsAddress(addr)) { message.Printf("Attempted to dereference an invalid ObjC Object or send it " "an unrecognized selector"); return true; @@ -223,29 +211,6 @@ class Instrumenter { return true; } - /// Build a function pointer for a function with signature void - /// (*)(uint8_t*) with a given address - /// - /// \param[in] start_address - /// The address of the function. - /// - /// \return - /// The function pointer, for use in a CallInst. - llvm::FunctionCallee BuildPointerValidatorFunc(lldb::addr_t start_address) { - llvm::Type *param_array[1]; - - param_array[0] = const_cast(GetI8PtrTy()); - - ArrayRef params(param_array, 1); - - FunctionType *fun_ty = FunctionType::get( - llvm::Type::getVoidTy(m_module.getContext()), params, true); - PointerType *fun_ptr_ty = PointerType::getUnqual(m_module.getContext()); - Constant *fun_addr_int = - ConstantInt::get(GetIntptrTy(), start_address, false); - return {fun_ty, ConstantExpr::getIntToPtr(fun_addr_int, fun_ptr_ty)}; - } - /// Build a function pointer for a function with signature void /// (*)(uint8_t*, uint8_t*) with a given address /// @@ -300,53 +265,6 @@ class Instrumenter { IntegerType *m_intptr_ty = nullptr; }; -class ValidPointerChecker : public Instrumenter { -public: - ValidPointerChecker(llvm::Module &module, - std::shared_ptr checker_function) - : Instrumenter(module, checker_function), - m_valid_pointer_check_func(nullptr) {} - - ~ValidPointerChecker() override = default; - -protected: - bool InstrumentInstruction(llvm::Instruction *inst) override { - Log *log = GetLog(LLDBLog::Expressions); - - LLDB_LOGF(log, "Instrumenting load/store instruction: %s\n", - PrintValue(inst).c_str()); - - if 
(!m_valid_pointer_check_func) - m_valid_pointer_check_func = - BuildPointerValidatorFunc(m_checker_function->StartAddress()); - - llvm::Value *dereferenced_ptr = nullptr; - - if (llvm::LoadInst *li = dyn_cast(inst)) - dereferenced_ptr = li->getPointerOperand(); - else if (llvm::StoreInst *si = dyn_cast(inst)) - dereferenced_ptr = si->getPointerOperand(); - else - return false; - - // Insert an instruction to call the helper with the result - CallInst::Create(m_valid_pointer_check_func, dereferenced_ptr, "", - inst->getIterator()); - - return true; - } - - bool InspectInstruction(llvm::Instruction &i) override { - if (isa(&i) || isa(&i)) - RegisterInstruction(i); - - return true; - } - -private: - llvm::FunctionCallee m_valid_pointer_check_func; -}; - class ObjcObjectChecker : public Instrumenter { public: ObjcObjectChecker(llvm::Module &module, @@ -527,16 +445,6 @@ bool IRDynamicChecks::runOnModule(llvm::Module &M) { return false; } - if (m_checker_functions.m_valid_pointer_check) { - ValidPointerChecker vpc(M, m_checker_functions.m_valid_pointer_check); - - if (!vpc.Inspect(*function)) - return false; - - if (!vpc.Instrument()) - return false; - } - if (m_checker_functions.m_objc_object_check) { ObjcObjectChecker ooc(M, m_checker_functions.m_objc_object_check); diff --git a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h index ff20c1f08be0c..f67229afc2152 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h @@ -53,7 +53,6 @@ class ClangDynamicCheckerFunctions bool DoCheckersExplainStop(lldb::addr_t addr, Stream &message) override; - std::shared_ptr m_valid_pointer_check; std::shared_ptr m_objc_object_check; }; diff --git a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp index fc17b76804d9f..bf8c393445908 100644 --- 
a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp @@ -116,15 +116,7 @@ bool lldb_private::formatters::WCharStringSummaryProvider( return false; // Get a wchar_t basic type from the current type system - CompilerType wchar_compiler_type = - valobj.GetCompilerType().GetBasicTypeFromAST(lldb::eBasicTypeWChar); - - if (!wchar_compiler_type) - return false; - - // Safe to pass nullptr for exe_scope here. - std::optional size = - llvm::expectedToOptional(wchar_compiler_type.GetBitSize(nullptr)); + std::optional size = GetWCharByteSize(valobj); if (!size) return false; const uint32_t wchar_size = *size; @@ -136,13 +128,13 @@ bool lldb_private::formatters::WCharStringSummaryProvider( options.SetPrefixToken("L"); switch (wchar_size) { - case 8: + case 1: return StringPrinter::ReadStringAndDumpToStream( options); - case 16: + case 2: return StringPrinter::ReadStringAndDumpToStream( options); - case 32: + case 4: return StringPrinter::ReadStringAndDumpToStream( options); default: @@ -177,15 +169,7 @@ bool lldb_private::formatters::WCharSummaryProvider( return false; // Get a wchar_t basic type from the current type system - CompilerType wchar_compiler_type = - valobj.GetCompilerType().GetBasicTypeFromAST(lldb::eBasicTypeWChar); - - if (!wchar_compiler_type) - return false; - - // Safe to pass nullptr for exe_scope here. 
- std::optional size = - llvm::expectedToOptional(wchar_compiler_type.GetBitSize(nullptr)); + std::optional size = GetWCharByteSize(valobj); if (!size) return false; const uint32_t wchar_size = *size; @@ -199,13 +183,13 @@ bool lldb_private::formatters::WCharSummaryProvider( options.SetBinaryZeroIsTerminator(false); switch (wchar_size) { - case 8: + case 1: return StringPrinter::ReadBufferAndDumpToStream( options); - case 16: + case 2: return StringPrinter::ReadBufferAndDumpToStream( options); - case 32: + case 4: return StringPrinter::ReadBufferAndDumpToStream( options); default: @@ -214,3 +198,73 @@ bool lldb_private::formatters::WCharSummaryProvider( } return true; } + +std::optional +lldb_private::formatters::GetWCharByteSize(ValueObject &valobj) { + return llvm::expectedToOptional( + valobj.GetCompilerType() + .GetBasicTypeFromAST(lldb::eBasicTypeWChar) + .GetByteSize(nullptr)); +} + +template +bool lldb_private::formatters::StringBufferSummaryProvider( + Stream &stream, const TypeSummaryOptions &summary_options, + lldb::ValueObjectSP location_sp, uint64_t size, std::string prefix_token) { + + if (size == 0) { + stream.PutCString(prefix_token); + stream.PutCString("\"\""); + return true; + } + + if (!location_sp) + return false; + + StringPrinter::ReadBufferAndDumpToStreamOptions options(*location_sp); + + if (summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryCapped) { + const auto max_size = + location_sp->GetTargetSP()->GetMaximumSizeOfStringSummary(); + if (size > max_size) { + size = max_size; + options.SetIsTruncated(true); + } + } + + { + DataExtractor extractor; + const size_t bytes_read = location_sp->GetPointeeData(extractor, 0, size); + if (bytes_read < size) + return false; + + options.SetData(std::move(extractor)); + } + options.SetStream(&stream); + if (prefix_token.empty()) + options.SetPrefixToken(nullptr); + else + options.SetPrefixToken(prefix_token); + options.SetQuote('"'); + options.SetSourceSize(size); + 
options.SetBinaryZeroIsTerminator(false); + return StringPrinter::ReadBufferAndDumpToStream(options); +} + +// explicit instantiations for all string element types +template bool +lldb_private::formatters::StringBufferSummaryProvider( + Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t, + std::string); +template bool +lldb_private::formatters::StringBufferSummaryProvider( + Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t, + std::string); +template bool +lldb_private::formatters::StringBufferSummaryProvider( + Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t, + std::string); +template bool +lldb_private::formatters::StringBufferSummaryProvider( + Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t, + std::string); diff --git a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h index a2b606d28cac1..337dcf2fefdcf 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h +++ b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h @@ -10,6 +10,7 @@ #ifndef LLDB_SOURCE_PLUGINS_LANGUAGE_CPLUSPLUS_CXXSTRINGTYPES_H #define LLDB_SOURCE_PLUGINS_LANGUAGE_CPLUSPLUS_CXXSTRINGTYPES_H +#include "lldb/DataFormatters/StringPrinter.h" #include "lldb/DataFormatters/TypeSummary.h" #include "lldb/Utility/Stream.h" #include "lldb/ValueObject/ValueObject.h" @@ -43,6 +44,34 @@ bool Char32SummaryProvider(ValueObject &valobj, Stream &stream, bool WCharSummaryProvider(ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options); // wchar_t +std::optional GetWCharByteSize(ValueObject &valobj); + +/// Print a summary for a string buffer to \a stream. +/// +/// \param[in] stream +/// The output stream to print the summary to. +/// +/// \param[in] summary_options +/// Options for printing the string contents. This function respects the +/// capping. +/// +/// \param[in] location_sp +/// ValueObject of a pointer to the string being printed. 
+/// +/// \param[in] size +/// The size of the buffer pointed to by \a location_sp. +/// +/// \param[in] prefix_token +/// A prefix before the double quotes (e.g. 'u' results in u"..."). +/// +/// \return +/// Returns whether the string buffer was successfully printed. +template +bool StringBufferSummaryProvider(Stream &stream, + const TypeSummaryOptions &summary_options, + lldb::ValueObjectSP location_sp, uint64_t size, + std::string prefix_token); + } // namespace formatters } // namespace lldb_private diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index 358cf7d78fa21..7143089209dd3 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -24,6 +24,7 @@ #include "lldb/ValueObject/ValueObject.h" #include "lldb/ValueObject/ValueObjectConstResult.h" +#include "Plugins/Language/CPlusPlus/CxxStringTypes.h" #include "Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h" #include "Plugins/TypeSystem/Clang/TypeSystemClang.h" #include "lldb/lldb-enumerations.h" @@ -535,70 +536,6 @@ ExtractLibcxxStringInfo(ValueObject &valobj) { return std::make_pair(size, location_sp); } -static bool -LibcxxWStringSummaryProvider(ValueObject &valobj, Stream &stream, - const TypeSummaryOptions &summary_options, - ValueObjectSP location_sp, size_t size) { - if (size == 0) { - stream.Printf("L\"\""); - return true; - } - if (!location_sp) - return false; - - StringPrinter::ReadBufferAndDumpToStreamOptions options(valobj); - if (summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryCapped) { - const auto max_size = valobj.GetTargetSP()->GetMaximumSizeOfStringSummary(); - if (size > max_size) { - size = max_size; - options.SetIsTruncated(true); - } - } - - DataExtractor extractor; - const size_t bytes_read = location_sp->GetPointeeData(extractor, 0, size); - if (bytes_read < size) - return false; - - // std::wstring::size() is measured in 
'characters', not bytes - TypeSystemClangSP scratch_ts_sp = - ScratchTypeSystemClang::GetForTarget(*valobj.GetTargetSP()); - if (!scratch_ts_sp) - return false; - - auto wchar_t_size = - scratch_ts_sp->GetBasicType(lldb::eBasicTypeWChar).GetByteSize(nullptr); - if (!wchar_t_size) - return false; - - options.SetData(std::move(extractor)); - options.SetStream(&stream); - options.SetPrefixToken("L"); - options.SetQuote('"'); - options.SetSourceSize(size); - options.SetBinaryZeroIsTerminator(false); - - switch (*wchar_t_size) { - case 1: - return StringPrinter::ReadBufferAndDumpToStream< - lldb_private::formatters::StringPrinter::StringElementType::UTF8>( - options); - break; - - case 2: - return StringPrinter::ReadBufferAndDumpToStream< - lldb_private::formatters::StringPrinter::StringElementType::UTF16>( - options); - break; - - case 4: - return StringPrinter::ReadBufferAndDumpToStream< - lldb_private::formatters::StringPrinter::StringElementType::UTF32>( - options); - } - return false; -} - bool lldb_private::formatters::LibcxxWStringSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &summary_options) { @@ -609,52 +546,22 @@ bool lldb_private::formatters::LibcxxWStringSummaryProvider( ValueObjectSP location_sp; std::tie(size, location_sp) = *string_info; - return ::LibcxxWStringSummaryProvider(valobj, stream, summary_options, - location_sp, size); -} - -template -static bool -LibcxxStringSummaryProvider(ValueObject &valobj, Stream &stream, - const TypeSummaryOptions &summary_options, - std::string prefix_token, ValueObjectSP location_sp, - uint64_t size) { - - if (size == 0) { - stream.Printf("\"\""); - return true; - } - - if (!location_sp) + auto wchar_t_size = GetWCharByteSize(valobj); + if (!wchar_t_size) return false; - StringPrinter::ReadBufferAndDumpToStreamOptions options(valobj); - - if (summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryCapped) { - const auto max_size = 
valobj.GetTargetSP()->GetMaximumSizeOfStringSummary(); - if (size > max_size) { - size = max_size; - options.SetIsTruncated(true); - } - } - - { - DataExtractor extractor; - const size_t bytes_read = location_sp->GetPointeeData(extractor, 0, size); - if (bytes_read < size) - return false; - - options.SetData(std::move(extractor)); + switch (*wchar_t_size) { + case 1: + return StringBufferSummaryProvider( + stream, summary_options, location_sp, size, "L"); + case 2: + return StringBufferSummaryProvider( + stream, summary_options, location_sp, size, "L"); + case 4: + return StringBufferSummaryProvider( + stream, summary_options, location_sp, size, "L"); } - options.SetStream(&stream); - if (prefix_token.empty()) - options.SetPrefixToken(nullptr); - else - options.SetPrefixToken(prefix_token); - options.SetQuote('"'); - options.SetSourceSize(size); - options.SetBinaryZeroIsTerminator(false); - return StringPrinter::ReadBufferAndDumpToStream(options); + return false; } template @@ -669,8 +576,8 @@ LibcxxStringSummaryProvider(ValueObject &valobj, Stream &stream, ValueObjectSP location_sp; std::tie(size, location_sp) = *string_info; - return LibcxxStringSummaryProvider( - valobj, stream, summary_options, prefix_token, location_sp, size); + return StringBufferSummaryProvider( + stream, summary_options, location_sp, size, prefix_token); } template static bool formatStringImpl(ValueObject &valobj, Stream &stream, @@ -742,8 +649,8 @@ static bool formatStringViewImpl(ValueObject &valobj, Stream &stream, return true; } - return LibcxxStringSummaryProvider( - valobj, stream, summary_options, prefix_token, dataobj, size); + return StringBufferSummaryProvider(stream, summary_options, + dataobj, size, prefix_token); } bool lldb_private::formatters::LibcxxStringViewSummaryProviderASCII( @@ -781,8 +688,22 @@ bool lldb_private::formatters::LibcxxWStringViewSummaryProvider( return true; } - return ::LibcxxWStringSummaryProvider(valobj, stream, summary_options, - dataobj, size); + auto 
wchar_t_size = GetWCharByteSize(valobj); + if (!wchar_t_size) + return false; + + switch (*wchar_t_size) { + case 1: + return StringBufferSummaryProvider( + stream, summary_options, dataobj, size, "L"); + case 2: + return StringBufferSummaryProvider( + stream, summary_options, dataobj, size, "L"); + case 4: + return StringBufferSummaryProvider( + stream, summary_options, dataobj, size, "L"); + } + return false; } static bool diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp index 642723dd91132..ffc33395830bb 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp @@ -99,14 +99,20 @@ static bool isUnorderedMap(ConstString type_name) { CompilerType lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: GetElementType(CompilerType table_type) { - auto element_type = table_type.GetTypedefedType().GetTypeTemplateArgument(0); + auto element_type = + table_type.GetDirectNestedTypeWithName("value_type").GetTypedefedType(); + + // In newer unordered_map layouts, the std::pair element type isn't wrapped + // in any helper types. So return it directly. + if (isStdTemplate(element_type.GetTypeName(), "pair")) + return element_type; // This synthetic provider is used for both unordered_(multi)map and - // unordered_(multi)set. For unordered_map, the element type has an - // additional type layer, an internal struct (`__hash_value_type`) - // that wraps a std::pair. Peel away the internal wrapper type - whose - // structure is of no value to users, to expose the std::pair. This - // matches the structure returned by the std::map synthetic provider. + // unordered_(multi)set. For older unordered_map layouts, the element type has + // an additional type layer, an internal struct (`__hash_value_type`) that + // wraps a std::pair. 
Peel away the internal wrapper type - whose structure is + // of no value to users, to expose the std::pair. This matches the structure + // returned by the std::map synthetic provider. if (isUnorderedMap( m_backend.GetCompilerType().GetCanonicalType().GetTypeName())) { std::string name; diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index 349b3a9d5e619..2602b338eb2c5 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -275,68 +275,100 @@ AddressClass ObjectFileXCOFF::GetAddressClass(addr_t file_addr) { return AddressClass::eUnknown; } -lldb::SymbolType MapSymbolType(llvm::object::SymbolRef::Type sym_type) { - if (sym_type == llvm::object::SymbolRef::ST_Function) +static lldb::SymbolType MapSymbolType(llvm::object::SymbolRef::Type sym_type) { + switch (sym_type) { + case llvm::object::SymbolRef::ST_Function: return lldb::eSymbolTypeCode; - else if (sym_type == llvm::object::SymbolRef::ST_Data) + case llvm::object::SymbolRef::ST_Data: return lldb::eSymbolTypeData; - else if (sym_type == llvm::object::SymbolRef::ST_File) + case llvm::object::SymbolRef::ST_File: return lldb::eSymbolTypeSourceFile; - return lldb::eSymbolTypeInvalid; + default: + return lldb::eSymbolTypeInvalid; + } } void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) { + Log *log = GetLog(LLDBLog::Object); SectionList *sectionList = GetSectionList(); for (const auto &symbol_ref : m_binary->symbols()) { llvm::object::XCOFFSymbolRef xcoff_sym_ref(symbol_ref); + llvm::Expected name_or_err = xcoff_sym_ref.getName(); if (!name_or_err) { - consumeError(name_or_err.takeError()); + LLDB_LOG_ERROR(log, name_or_err.takeError(), + "Unable to extract name from the xcoff symbol ref object"); continue; } + llvm::StringRef symbolName = name_or_err.get(); - // Remove the dot prefix for demangle - llvm::StringRef symbol_name = + // Remove the . 
prefix added during compilation. This prefix is usually + // added to differentiate between reference to the code and function + // descriptor. For instance, Adding .func will only allow user to put bp on + // .func, which is not known to the user, instead of func. + llvm::StringRef name_no_dot = symbolName.starts_with(".") ? symbolName.drop_front() : symbolName; auto storageClass = xcoff_sym_ref.getStorageClass(); + // C_HIDEXT symbols are not needed to be exposed, with the exception of TOC + // which is responsible for storing references to global data if (storageClass == XCOFF::C_HIDEXT && symbolName != "TOC") { + + // Zero or muliple aux entries may suggest ambiguous data if (xcoff_sym_ref.getNumberOfAuxEntries() != 1) continue; + auto aux_csect_or_err = xcoff_sym_ref.getXCOFFCsectAuxRef(); if (!aux_csect_or_err) { - consumeError(aux_csect_or_err.takeError()); + LLDB_LOG_ERROR(log, aux_csect_or_err.takeError(), + "Unable to access xcoff csect aux ref object"); continue; } + const llvm::object::XCOFFCsectAuxRef csect_aux = aux_csect_or_err.get(); - if (csect_aux.getStorageMappingClass() != XCOFF::XMC_PR || - (m_binary->is64Bit() ? (csect_aux.getAuxType64() != XCOFF::AUX_CSECT) - : false)) + + // Only add hidden ext entries which come under Program Code, skip others + // as they are not useful as debugging data. + if (csect_aux.getStorageMappingClass() != XCOFF::XMC_PR) continue; + + // This does not apply to 32-bit, + // Only add csect symbols identified by the aux entry, as they are + // needed to reference section information. Skip others + if (m_binary->is64Bit()) + if (csect_aux.getAuxType64() != XCOFF::AUX_CSECT) + continue; } Symbol symbol; - symbol.GetMangled().SetValue(ConstString(symbol_name)); + symbol.GetMangled().SetValue(ConstString(name_no_dot)); int16_t sectionNumber = xcoff_sym_ref.getSectionNumber(); + // Note that XCOFF section headers are numbered from 1 and not 0. 
size_t sectionIndex = static_cast(sectionNumber - 1); - if (sectionNumber > 0 && sectionIndex < sectionList->GetSize()) { - lldb::SectionSP section_sp = - sectionList->GetSectionAtIndex(sectionNumber - 1); - if (!section_sp || section_sp->GetFileAddress() == LLDB_INVALID_ADDRESS) - continue; - lldb::addr_t file_addr = section_sp->GetFileAddress(); - lldb::addr_t symbolValue = xcoff_sym_ref.getValue(); - if (symbolValue < file_addr) - continue; - symbol.GetAddressRef() = Address(section_sp, symbolValue - file_addr); + if (sectionNumber > 0) { + if (sectionIndex < sectionList->GetSize()) { + + lldb::SectionSP section_sp = + sectionList->GetSectionAtIndex(sectionIndex); + if (!section_sp || section_sp->GetFileAddress() == LLDB_INVALID_ADDRESS) + continue; + + lldb::addr_t file_addr = section_sp->GetFileAddress(); + lldb::addr_t symbolValue = xcoff_sym_ref.getValue(); + if (symbolValue < file_addr) + continue; + + symbol.GetAddressRef() = Address(section_sp, symbolValue - file_addr); + } } Expected sym_type_or_err = symbol_ref.getType(); if (!sym_type_or_err) { - consumeError(sym_type_or_err.takeError()); - continue; + LLDB_LOG_ERROR(log, sym_type_or_err.takeError(), + "Unable to access xcoff symbol type"); + continue; } symbol.SetType(MapSymbolType(sym_type_or_err.get())); diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp index 94c0a5f11e435..ef691b77193ce 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include using namespace lldb_private; using namespace minidump; @@ -75,8 +75,7 @@ UUID MinidumpParser::GetModuleUUID(const minidump::Module *module) { if (GetArchitecture().GetTriple().isOSBinFormatELF()) { if (pdb70_uuid->Age != 0) return UUID(pdb70_uuid, sizeof(*pdb70_uuid)); - return UUID(&pdb70_uuid->Uuid, - sizeof(pdb70_uuid->Uuid)); + return 
UUID(&pdb70_uuid->Uuid, sizeof(pdb70_uuid->Uuid)); } return UUID(*pdb70_uuid); } else if (cv_signature == CvSignature::ElfBuildId) @@ -429,62 +428,65 @@ MinidumpParser::GetExceptionStreams() { std::optional MinidumpParser::FindMemoryRange(lldb::addr_t addr) { - Log *log = GetLog(LLDBLog::Modules); + if (m_memory_ranges.IsEmpty()) + PopulateMemoryRanges(); + + const MemoryRangeVector::Entry *entry = + m_memory_ranges.FindEntryThatContains(addr); + if (!entry) + return std::nullopt; + return entry->data; +} + +void MinidumpParser::PopulateMemoryRanges() { + Log *log = GetLog(LLDBLog::Modules); auto ExpectedMemory = GetMinidumpFile().getMemoryList(); - if (!ExpectedMemory) { - LLDB_LOG_ERROR(log, ExpectedMemory.takeError(), - "Failed to read memory list: {0}"); - } else { + if (ExpectedMemory) { for (const auto &memory_desc : *ExpectedMemory) { const LocationDescriptor &loc_desc = memory_desc.Memory; const lldb::addr_t range_start = memory_desc.StartOfMemoryRange; const size_t range_size = loc_desc.DataSize; - - if (loc_desc.RVA + loc_desc.DataSize > GetData().size()) - return std::nullopt; - - if (range_start <= addr && addr < range_start + range_size) { - auto ExpectedSlice = GetMinidumpFile().getRawData(loc_desc); - if (!ExpectedSlice) { - LLDB_LOG_ERROR(log, ExpectedSlice.takeError(), - "Failed to get memory slice: {0}"); - return std::nullopt; - } - return minidump::Range(range_start, *ExpectedSlice); + auto ExpectedSlice = GetMinidumpFile().getRawData(loc_desc); + if (!ExpectedSlice) { + LLDB_LOG_ERROR(log, ExpectedSlice.takeError(), + "Failed to get memory slice: {0}"); + continue; } + m_memory_ranges.Append(MemoryRangeVector::Entry( + range_start, range_size, + minidump::Range(range_start, *ExpectedSlice))); } + } else { + LLDB_LOG_ERROR(log, ExpectedMemory.takeError(), + "Failed to read memory list: {0}"); } if (!GetStream(StreamType::Memory64List).empty()) { llvm::Error err = llvm::Error::success(); - for (const auto &memory_desc : 
GetMinidumpFile().getMemory64List(err)) { - if (memory_desc.first.StartOfMemoryRange <= addr - && addr < memory_desc.first.StartOfMemoryRange + memory_desc.first.DataSize) { - return minidump::Range(memory_desc.first.StartOfMemoryRange, memory_desc.second); - } + for (const auto &memory_desc : GetMinidumpFile().getMemory64List(err)) { + m_memory_ranges.Append(MemoryRangeVector::Entry( + memory_desc.first.StartOfMemoryRange, memory_desc.first.DataSize, + minidump::Range(memory_desc.first.StartOfMemoryRange, + memory_desc.second))); } if (err) LLDB_LOG_ERROR(log, std::move(err), "Failed to read memory64 list: {0}"); } - return std::nullopt; + m_memory_ranges.Sort(); } llvm::ArrayRef MinidumpParser::GetMemory(lldb::addr_t addr, size_t size) { - // I don't have a sense of how frequently this is called or how many memory - // ranges a Minidump typically has, so I'm not sure if searching for the - // appropriate range linearly each time is stupid. Perhaps we should build - // an index for faster lookups. std::optional range = FindMemoryRange(addr); if (!range) return {}; // There's at least some overlap between the beginning of the desired range - // (addr) and the current range. Figure out where the overlap begins and how - // much overlap there is. + // (addr) and the current range. Figure out where the overlap begins and + // how much overlap there is. 
const size_t offset = addr - range->start; @@ -495,7 +497,8 @@ llvm::ArrayRef MinidumpParser::GetMemory(lldb::addr_t addr, return range->range_ref.slice(offset, overlap); } -llvm::iterator_range MinidumpParser::GetMemory64Iterator(llvm::Error &err) { +llvm::iterator_range +MinidumpParser::GetMemory64Iterator(llvm::Error &err) { llvm::ErrorAsOutParameter ErrAsOutParam(&err); return m_file->getMemory64List(err); } @@ -607,8 +610,7 @@ std::pair MinidumpParser::BuildMemoryRegions() { case StreamType::ST: \ return #ST -llvm::StringRef -MinidumpParser::GetStreamTypeAsString(StreamType stream_type) { +llvm::StringRef MinidumpParser::GetStreamTypeAsString(StreamType stream_type) { switch (stream_type) { ENUM_TO_CSTR(Unused); ENUM_TO_CSTR(ThreadList); diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h index 2c5e6f19ff9a1..14599f8d572aa 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h @@ -17,6 +17,7 @@ #include "lldb/Utility/Status.h" #include "lldb/Utility/UUID.h" +#include "lldb/Utility/RangeMap.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringRef.h" @@ -35,6 +36,9 @@ namespace minidump { // Describes a range of memory captured in the Minidump struct Range { + // Default constructor required for range data vector + // but unusued. 
+ Range() = default; lldb::addr_t start; // virtual address of the beginning of the range // range_ref - absolute pointer to the first byte of the range and size llvm::ArrayRef range_ref; @@ -45,9 +49,18 @@ struct Range { friend bool operator==(const Range &lhs, const Range &rhs) { return lhs.start == rhs.start && lhs.range_ref == rhs.range_ref; } + + friend bool operator<(const Range &lhs, const Range &rhs) { + if (lhs.start == rhs.start) + return lhs.range_ref.size() < rhs.range_ref.size(); + return lhs.start < rhs.start; + } }; -using FallibleMemory64Iterator = llvm::object::MinidumpFile::FallibleMemory64Iterator; +using MemoryRangeVector = + lldb_private::RangeDataVector; +using FallibleMemory64Iterator = + llvm::object::MinidumpFile::FallibleMemory64Iterator; using ExceptionStreamsIterator = llvm::object::MinidumpFile::ExceptionStreamsIterator; @@ -97,7 +110,8 @@ class MinidumpParser { /// complete (includes all regions mapped into the process memory). std::pair BuildMemoryRegions(); - llvm::iterator_range GetMemory64Iterator(llvm::Error &err); + llvm::iterator_range + GetMemory64Iterator(llvm::Error &err); static llvm::StringRef GetStreamTypeAsString(StreamType stream_type); @@ -109,10 +123,11 @@ class MinidumpParser { private: MinidumpParser(lldb::DataBufferSP data_sp, std::unique_ptr file); - + void PopulateMemoryRanges(); lldb::DataBufferSP m_data_sp; std::unique_ptr m_file; ArchSpec m_arch; + MemoryRangeVector m_memory_ranges; }; } // end namespace minidump diff --git a/lldb/test/API/commands/plugin/TestPlugin.py b/lldb/test/API/commands/plugin/TestPlugin.py new file mode 100644 index 0000000000000..fdfb14bfcc24e --- /dev/null +++ b/lldb/test/API/commands/plugin/TestPlugin.py @@ -0,0 +1,62 @@ +""" +Make sure the plugin list, enable, and disable commands work. +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * + + +class TestFrameVar(TestBase): + # If your test case doesn't stress debug info, then + # set this to true. 
That way it won't be run once for + # each debug info format. + NO_DEBUG_INFO_TESTCASE = True + + def test_plugin_list_enable_disable_commands(self): + for plugin_namespace in [ + "abi", + "architecture", + "disassembler", + "dynamic-loader", + "emulate-instruction", + "instrumentation-runtime", + "jit-loader", + "language", + "language-runtime", + "memory-history", + "object-container", + "object-file", + "operating-system", + "platform", + "process", + "repl", + "register-type-builder", + "script-interpreter", + "scripted-interface", + "structured-data", + "symbol-file", + "symbol-locator", + "symbol-vendor", + "system-runtime", + # 'trace', # No trace plugin is registered by default. + "trace-exporter", + "type-system", + "unwind-assembly", + ]: + self.do_list_disable_enable_test(plugin_namespace) + + def do_list_disable_enable_test(self, plugin_namespace): + # Plugins are enabled by default. + self.expect( + f"plugin list {plugin_namespace}", substrs=[plugin_namespace, "[+]"] + ) + + # Plugins can be disabled. + self.expect( + f"plugin disable {plugin_namespace}", substrs=[plugin_namespace, "[-]"] + ) + + # Plugins can be enabled. + self.expect( + f"plugin enable {plugin_namespace}", substrs=[plugin_namespace, "[+]"] + ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py index 5c5cf4ca16b98..32764629d65a7 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py @@ -65,11 +65,9 @@ def cleanup(): '(%s::wstring) IHaveEmbeddedZerosToo = L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"' % ns, '(%s::u16string) u16_string = u"ß水氶"' % ns, - # FIXME: This should have a 'u' prefix. 
- '(%s::u16string) u16_empty = ""' % ns, + '(%s::u16string) u16_empty = u""' % ns, '(%s::u32string) u32_string = U"🍄🍅🍆🍌"' % ns, - # FIXME: This should have a 'U' prefix. - '(%s::u32string) u32_empty = ""' % ns, + '(%s::u32string) u32_empty = U""' % ns, "(%s::string *) null_str = nullptr" % ns, ], ) @@ -123,7 +121,7 @@ def cleanup(): % ns, '(%s::u16string) u16_string = u"ß水氶"' % ns, '(%s::u32string) u32_string = U"🍄🍅🍆🍌"' % ns, - '(%s::u32string) u32_empty = ""' % ns, + '(%s::u32string) u32_empty = U""' % ns, "(%s::string *) null_str = nullptr" % ns, ], ) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py index f8fc8ae66405b..3883395f23924 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py @@ -81,11 +81,11 @@ def cleanup(): summary='L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"', ) self.expect_var_path("u16_string", type="std::u16string_view", summary='u"ß水氶"') - self.expect_var_path("u16_empty", type="std::u16string_view", summary='""') + self.expect_var_path("u16_empty", type="std::u16string_view", summary='u""') self.expect_var_path( "u32_string", type="std::u32string_view", summary='U"🍄🍅🍆🍌"' ) - self.expect_var_path("u32_empty", type="std::u32string_view", summary='""') + self.expect_var_path("u32_empty", type="std::u32string_view", summary='U""') self.expect_var_path( "oops", type="std::string_view", summary='"Hellooo World\\n"' ) @@ -145,11 +145,11 @@ def cleanup(): summary='L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"', ) self.expect_var_path("u16_string", type="std::u16string_view", summary='u"ß水氶"') - self.expect_var_path("u16_empty", 
type="std::u16string_view", summary='""') + self.expect_var_path("u16_empty", type="std::u16string_view", summary='u""') self.expect_var_path( "u32_string", type="std::u32string_view", summary='U"🍄🍅🍆🍌"' ) - self.expect_var_path("u32_empty", type="std::u32string_view", summary='""') + self.expect_var_path("u32_empty", type="std::u32string_view", summary='U""') self.runCmd("cont") self.expect( diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 646a446c86fdb..83713213ce1fe 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -349,3 +349,10 @@ def delete_module_cache(path): for v in ["SystemDrive"]: if v in os.environ: config.environment[v] = os.environ[v] + +# Some steps required to initialize the tests dynamically link with python.dll +# and need to know the location of the Python libraries. This ensures that we +# use the same version of Python that was used to build lldb to run our tests. +config.environment["PATH"] = os.path.pathsep.join( + (config.python_root_dir, config.environment.get("PATH", "")) +) diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 8552d17d66631..86d58889cc4ad 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -20,6 +20,7 @@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" config.target_triple = "@LLVM_TARGET_TRIPLE@" config.lldb_build_directory = "@LLDB_TEST_BUILD_DIRECTORY@" config.python_executable = "@LLDB_PYTHON_API_TEST_EXECUTABLE@" +config.python_root_dir = "@Python3_ROOT_DIR@" config.lua_executable = "@LUA_EXECUTABLE@" config.lldb_lua_cpath = "@LLDB_LUA_CPATH@" config.lua_test_entry = "TestLuaAPI.py" diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py index 4dc8c5b3c7ded..4ca733a9a59ca 100644 --- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py +++ 
b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py @@ -1,16 +1,12 @@ """ -Test lldb-dap setBreakpoints request +Test lldb-dap setExceptionBreakpoints request """ - -import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil import lldbdap_testcase -@skip("Temporarily disable the breakpoint tests") class TestDAP_setExceptionBreakpoints(lldbdap_testcase.DAPTestCaseBase): @skipIfWindows def test_functionality(self): @@ -33,8 +29,9 @@ def test_functionality(self): program = self.getBuildArtifact("a.out") self.build_and_launch(program) - filters = ["cpp_throw", "cpp_catch"] - response = self.dap_server.request_setExceptionBreakpoints(filters) + response = self.dap_server.request_setExceptionBreakpoints( + filters=["cpp_throw", "cpp_catch"], + ) if response: self.assertTrue(response["success"]) diff --git a/lldb/test/API/tools/lldb-dap/exception/objc/Makefile b/lldb/test/API/tools/lldb-dap/exception/objc/Makefile index 9b6528337cb9d..17e6dc76699ab 100644 --- a/lldb/test/API/tools/lldb-dap/exception/objc/Makefile +++ b/lldb/test/API/tools/lldb-dap/exception/objc/Makefile @@ -1,6 +1,6 @@ OBJC_SOURCES := main.m -CFLAGS_EXTRAS := -w +CFLAGS_EXTRAS := -w -fobjc-exceptions USE_SYSTEM_STDLIB := 1 diff --git a/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py b/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py index 777d55f48e850..ddedf7a6de8c6 100644 --- a/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py +++ b/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py @@ -2,7 +2,6 @@ Test exception behavior in DAP with obj-c throw. 
""" - from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * import lldbdap_testcase @@ -25,3 +24,41 @@ def test_stopped_description(self): exception_details = exception_info["details"] self.assertRegex(exception_details["message"], "SomeReason") self.assertRegex(exception_details["stackTrace"], "main.m") + + @skipUnlessDarwin + def test_break_on_throw_and_catch(self): + """ + Test that breakpoints on exceptions work as expected. + """ + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + + response = self.dap_server.request_setExceptionBreakpoints( + filter_options=[ + { + "filterId": "objc_throw", + "condition": '[[((NSException *)$arg1) name] isEqual:@"ThrownException"]', + }, + ] + ) + if response: + self.assertTrue(response["success"]) + + self.continue_to_exception_breakpoint("Objective-C Throw") + + # FIXME: Catching objc exceptions do not appear to be working. + # Xcode appears to set a breakpoint on '__cxa_begin_catch' for objc + # catch, which is different than + # SBTarget::BreakpointCreateForException(eLanguageObjectiveC, /*catch_bp=*/true, /*throw_bp=*/false); + # self.continue_to_exception_breakpoint("Objective-C Catch") + + self.do_continue() + + self.assertTrue(self.verify_stop_exception_info("signal SIGABRT")) + exception_info = self.get_exceptionInfo() + self.assertEqual(exception_info["breakMode"], "always") + self.assertEqual(exception_info["description"], "signal SIGABRT") + self.assertEqual(exception_info["exceptionId"], "signal") + exception_details = exception_info["details"] + self.assertRegex(exception_details["message"], "SomeReason") + self.assertRegex(exception_details["stackTrace"], "main.m") diff --git a/lldb/test/API/tools/lldb-dap/exception/objc/main.m b/lldb/test/API/tools/lldb-dap/exception/objc/main.m index e8db04fb40de1..bbfa621992799 100644 --- a/lldb/test/API/tools/lldb-dap/exception/objc/main.m +++ b/lldb/test/API/tools/lldb-dap/exception/objc/main.m @@ -1,8 +1,14 @@ #import int 
main(int argc, char const *argv[]) { - @throw [[NSException alloc] initWithName:@"ThrownException" - reason:@"SomeReason" - userInfo:nil]; + @try { + NSException *e = [[NSException alloc] initWithName:@"ThrownException" + reason:@"SomeReason" + userInfo:nil]; + @throw e; + } @catch (NSException *e) { + NSLog(@"Caught %@", e); + @throw; // let the process crash... + } return 0; } diff --git a/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py b/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py index 4045dd8fb6569..77c1e47914a39 100644 --- a/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py +++ b/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py @@ -32,13 +32,7 @@ def test_save_core(self): # Getting dap stack trace may trigger __lldb_caller_function JIT module to be created. self.get_stackFrames(startFrame=0) - # Evaluating an expression that cause "_$__lldb_valid_pointer_check" JIT module to be created. - expression = 'printf("this is a test")' - self.dap_server.request_evaluate(expression, context="watch") - - # Verify "_$__lldb_valid_pointer_check" JIT module is created. 
modules = self.dap_server.get_modules() - self.assertTrue(modules["_$__lldb_valid_pointer_check"]) thread_count = len(self.dap_server.get_threads()) core_stack = self.getBuildArtifact("core.stack.dmp") diff --git a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py index 07acfe07c9ffc..03b79a805d341 100644 --- a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py +++ b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py @@ -78,3 +78,53 @@ def test_basic(self): leaf_frame = self.dap_server.get_stackFrame() self.assertIsNotNone(leaf_frame, "expect a leaf frame") self.assertEqual(step_in_targets[1]["label"], leaf_frame["name"]) + + @skipIf(archs=no_match(["x86", "x86_64"])) + def test_supported_capability_x86_arch(self): + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.cpp" + bp_lines = [line_number(source, "// set breakpoint here")] + breakpoint_ids = self.set_source_breakpoints(source, bp_lines) + self.assertEqual( + len(breakpoint_ids), len(bp_lines), "expect correct number of breakpoints" + ) + self.continue_to_breakpoints(breakpoint_ids) + is_supported = self.dap_server.get_capability("supportsStepInTargetsRequest") + + self.assertEqual( + is_supported, + True, + f"expect capability `stepInTarget` is supported with architecture {self.getArchitecture()}", + ) + # clear breakpoints. 
+ self.set_source_breakpoints(source, []) + self.continue_to_exit() + + @skipIf(archs=["x86", "x86_64"]) + def test_supported_capability_other_archs(self): + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.cpp" + bp_lines = [line_number(source, "// set breakpoint here")] + breakpoint_ids = self.set_source_breakpoints(source, bp_lines) + self.assertEqual( + len(breakpoint_ids), len(bp_lines), "expect correct number of breakpoints" + ) + self.continue_to_breakpoints(breakpoint_ids) + + try: + is_supported = self.dap_server.get_capability( + "supportsStepInTargetsRequest" + ) + except dap_server.NotSupportedError: + is_supported = False + + self.assertEqual( + is_supported, + False, + f"expect capability `stepInTarget` is not supported with architecture {self.getArchitecture()}", + ) + # clear breakpoints. + self.set_source_breakpoints(source, []) + self.continue_to_exit() diff --git a/lldb/test/Shell/Commands/command-plugin-list.test b/lldb/test/Shell/Commands/command-plugin-list.test index 9d3680d48cdd0..3f02157665bb2 100644 --- a/lldb/test/Shell/Commands/command-plugin-list.test +++ b/lldb/test/Shell/Commands/command-plugin-list.test @@ -10,10 +10,10 @@ # Test plugin list without an argument will list all plugins. plugin list # CHECK-LABEL: plugin list -# CHECK: system-runtime -# CHECK: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries -# CHECK: instrumentation-runtime -# CHECK: [+] AddressSanitizer AddressSanitizer instrumentation runtime plugin. +# CHECK-DAG: instrumentation-runtime +# CHECK-DAG: [+] AddressSanitizer AddressSanitizer instrumentation runtime plugin. +# CHECK-DAG: system-runtime +# CHECK-DAG: [+] systemruntime-macosx System runtime plugin for Mac OS X native libraries # Test plugin list works with fully qualified name. 
plugin list system-runtime.systemruntime-macosx diff --git a/lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml new file mode 100644 index 0000000000000..6b1a40a283445 --- /dev/null +++ b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml @@ -0,0 +1,121 @@ +# RUN: yaml2obj %s -o %t +# RUN: %lldb %t -o "image dump symtab" -o exit | FileCheck %s +# CHECK: Index UserID DSX Type File Address/Value Load Address Size Flags Name +# CHECK: [ 0] 4294967295 Invalid 0xffffffffffffffff 0x0000000000000000 0x00000000 errno +# CHECK: [ 1] 4294967295 Code 0x0000000100000500 0x0000000000000398 0x00000000 __threads_init +# CHECK: [ 2] 4294967295 Data 0x0000000110000a70 0x0000000000000060 0x00000000 __threads_init +# CHECK: [ 3] 4294967295 Invalid 0x0000000110000ad0 0x00000000000000b0 0x00000000 TOC +# CHECK: [ 4] 4294967295 Invalid 0x0000000100000898 0x00000000100001d8 0x00000000 text +# CHECK: [ 5] 4294967295 Code 0x0000000100000898 0x00000000100001d8 0x00000000 main + +--- !XCOFF +FileHeader: + MagicNumber: 0x1F7 + NumberOfSections: 2 + CreationTime: 000000000 + Flags: 0x0002 +Sections: + - Name: .text + Address: 0x100000438 + Size: 0x38 + FileOffsetToData: 0x0 + FileOffsetToLineNumbers: 0x0 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_TEXT ] + SectionData: E8C20000 + - Name: .data + Address: 0x1100008D2 + Size: 0x2AE + FileOffsetToData: 0x8D2 + FileOffsetToRelocations: 0x132E + FileOffsetToLineNumbers: 0x0 + NumberOfRelocations: 0x22 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_DATA ] + SectionData: '' +Symbols: + - Name: errno + Value: 0x0 + Section: N_UNDEF + Type: 0x0 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + SymbolAlignmentAndType: 0 + StorageMappingClass: XMC_RW + SectionOrLengthLo: 0 + SectionOrLengthHi: 0 + - Name: .__threads_init + Value: 0x100000500 + Section: .text + Type: 0x20 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - 
Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + SymbolAlignmentAndType: 2 + StorageMappingClass: XMC_PR + SectionOrLengthLo: 80 + SectionOrLengthHi: 0 + - Name: __threads_init + Value: 0x110000A70 + Section: .data + Type: 0x0 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + SymbolAlignmentAndType: 25 + StorageMappingClass: XMC_DS + SectionOrLengthLo: 24 + SectionOrLengthHi: 0 + - Name: TOC + Value: 0x110000AD0 + Section: .data + Type: 0x0 + StorageClass: C_HIDEXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + SymbolAlignmentAndType: 25 + StorageMappingClass: XMC_TC0 + SectionOrLengthLo: 0 + SectionOrLengthHi: 0 + - Name: .text + Value: 0x100000898 + Section: .text + Type: 0x0 + StorageClass: C_HIDEXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + SymbolAlignmentAndType: 17 + StorageMappingClass: XMC_PR + SectionOrLengthLo: 58 + SectionOrLengthHi: 0 + - Name: .main + Value: 0x100000898 + Section: .text + Type: 0x0 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + SymbolAlignmentAndType: 2 + StorageMappingClass: XMC_PR + SectionOrLengthLo: 135 + SectionOrLengthHi: 0 +... 
diff --git a/lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml new file mode 100644 index 0000000000000..59c018ba0e426 --- /dev/null +++ b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml @@ -0,0 +1,124 @@ +# RUN: yaml2obj %s -o %t +# RUN: %lldb %t -o "image dump symtab" -o exit | FileCheck %s +# CHECK: Index UserID DSX Type File Address/Value Load Address Size Flags Name +# CHECK: [ 0] 4294967295 Invalid 0xffffffffffffffff 0x0000000000000000 0x00000000 errno +# CHECK: [ 1] 4294967295 Code 0x0000000010000320 0x0000000000000420 0x00000000 __threads_init +# CHECK: [ 2] 4294967295 Data 0x0000000020000920 0x000000000000003c 0x00000000 __threads_init +# CHECK: [ 3] 4294967295 Invalid 0x000000002000095c 0x0000000000000060 0x00000000 TOC +# CHECK: [ 4] 4294967295 Invalid 0x0000000010000740 0x000000000000003a 0x00000000 text +# CHECK: [ 5] 4294967295 Invalid 0x0000000010000740 0x000000000000003a 0x00000000 main + +--- !XCOFF +FileHeader: + MagicNumber: 0x1DF + NumberOfSections: 2 + CreationTime: 000000000 + Flags: 0x1002 +Sections: + - Name: .text + Address: 0x10000268 + Size: 0x512 + FileOffsetToData: 0x268 + FileOffsetToRelocations: 0xECC + FileOffsetToLineNumbers: 0x0 + NumberOfRelocations: 0x24 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_TEXT ] + SectionData: 80C20000 + - Name: .data + Address: 0x2000077A + Size: 0x242 + FileOffsetToData: 0x77A + FileOffsetToRelocations: 0x1034 + FileOffsetToLineNumbers: 0x0 + NumberOfRelocations: 0x25 + NumberOfLineNumbers: 0x0 + Flags: [ STYP_DATA ] + SectionData: '' +Symbols: + - Name: errno + Value: 0x0 + Section: N_UNDEF + Type: 0x0 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + StorageMappingClass: XMC_RW + SectionOrLength: 0 + StabInfoIndex: 0 + StabSectNum: 0 + - Name: .__threads_init + Value: 0x10000320 + Section: .text + Type: 0x20 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: 
+ - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + StorageMappingClass: XMC_PR + SectionOrLength: 84 + StabInfoIndex: 0 + StabSectNum: 0 + - Name: __threads_init + Value: 0x20000920 + Section: .data + Type: 0x0 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + StorageMappingClass: XMC_DS + SectionOrLength: 12 + StabInfoIndex: 0 + StabSectNum: 0 + - Name: TOC + Value: 0x2000095C + Section: .data + Type: 0x0 + StorageClass: C_HIDEXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + StorageMappingClass: XMC_TC0 + SectionOrLength: 0 + StabInfoIndex: 0 + StabSectNum: 0 + - Name: .text + Value: 0x10000740 + Section: .text + Type: 0x0 + StorageClass: C_HIDEXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + StorageMappingClass: XMC_PR + SectionOrLength: 58 + StabInfoIndex: 0 + StabSectNum: 0 + - Name: .main + Value: 0x10000740 + Section: .text + Type: 0x0 + StorageClass: C_EXT + NumberOfAuxEntries: 1 + AuxEntries: + - Type: AUX_CSECT + ParameterHashIndex: 0 + TypeChkSectNum: 0 + StorageMappingClass: XMC_PR + SectionOrLength: 137 + StabInfoIndex: 0 + StabSectNum: 0 + +... diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py index ab6113767187a..8c9448b23c56b 100644 --- a/lldb/test/Shell/lit.cfg.py +++ b/lldb/test/Shell/lit.cfg.py @@ -203,3 +203,6 @@ def calculate_arch_features(arch_string): # location of the Python libraries. This ensures that we use the same # version of Python that was used to build lldb to run our tests. 
config.environment["PYTHONHOME"] = config.python_root_dir +config.environment["PATH"] = os.path.pathsep.join( + (config.python_root_dir, config.environment.get("PATH", "")) +) diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp index 391d1c50168ea..8be384c6d24af 100644 --- a/lldb/tools/debugserver/source/RNBRemote.cpp +++ b/lldb/tools/debugserver/source/RNBRemote.cpp @@ -1476,7 +1476,6 @@ bool RNBRemote::InitializeRegisters(bool force) { void RNBRemote::NotifyThatProcessStopped(void) { RNBRemote::HandlePacket_last_signal(NULL); - return; } /* 'A arglen,argnum,arg,...' diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 9fe8227cd2d6f..c171b55951cb5 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -9,6 +9,7 @@ #include "DAP.h" #include "DAPLog.h" #include "EventHelper.h" +#include "ExceptionBreakpoint.h" #include "Handler/RequestHandler.h" #include "Handler/ResponseHandler.h" #include "JSONUtils.h" @@ -17,6 +18,7 @@ #include "Protocol/ProtocolBase.h" #include "Protocol/ProtocolRequests.h" #include "Protocol/ProtocolTypes.h" +#include "ProtocolUtils.h" #include "Transport.h" #include "lldb/API/SBBreakpoint.h" #include "lldb/API/SBCommandInterpreter.h" @@ -129,93 +131,81 @@ DAP::DAP(Log *log, const ReplMode default_repl_mode, DAP::~DAP() = default; void DAP::PopulateExceptionBreakpoints() { - llvm::call_once(init_exception_breakpoints_flag, [this]() { - exception_breakpoints = std::vector{}; - - if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) { - exception_breakpoints->emplace_back(*this, "cpp_catch", "C++ Catch", - lldb::eLanguageTypeC_plus_plus); - exception_breakpoints->emplace_back(*this, "cpp_throw", "C++ Throw", - lldb::eLanguageTypeC_plus_plus); - } - if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeObjC)) { - exception_breakpoints->emplace_back( - *this, "objc_catch", "Objective-C Catch", lldb::eLanguageTypeObjC); - 
exception_breakpoints->emplace_back( - *this, "objc_throw", "Objective-C Throw", lldb::eLanguageTypeObjC); - } - if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeSwift)) { - exception_breakpoints->emplace_back(*this, "swift_catch", "Swift Catch", - lldb::eLanguageTypeSwift); - exception_breakpoints->emplace_back(*this, "swift_throw", "Swift Throw", - lldb::eLanguageTypeSwift); + if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) { + exception_breakpoints.emplace_back(*this, "cpp_catch", "C++ Catch", + lldb::eLanguageTypeC_plus_plus, + eExceptionKindCatch); + exception_breakpoints.emplace_back(*this, "cpp_throw", "C++ Throw", + lldb::eLanguageTypeC_plus_plus, + eExceptionKindThrow); + } + + if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeObjC)) { + exception_breakpoints.emplace_back(*this, "objc_catch", "Objective-C Catch", + lldb::eLanguageTypeObjC, + eExceptionKindCatch); + exception_breakpoints.emplace_back(*this, "objc_throw", "Objective-C Throw", + lldb::eLanguageTypeObjC, + eExceptionKindThrow); + } + + if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeSwift)) { + exception_breakpoints.emplace_back(*this, "swift_catch", "Swift Catch", + lldb::eLanguageTypeSwift, + eExceptionKindCatch); + exception_breakpoints.emplace_back(*this, "swift_throw", "Swift Throw", + lldb::eLanguageTypeSwift, + eExceptionKindThrow); + } + + // Besides handling the hardcoded list of languages from above, we try to find + // any other languages that support exception breakpoints using the SB API. + for (int raw_lang = lldb::eLanguageTypeUnknown; + raw_lang < lldb::eNumLanguageTypes; ++raw_lang) { + lldb::LanguageType lang = static_cast(raw_lang); + + // We first discard any languages already handled above. 
+ if (lldb::SBLanguageRuntime::LanguageIsCFamily(lang) || + lang == lldb::eLanguageTypeSwift) + continue; + + if (!lldb::SBDebugger::SupportsLanguage(lang)) + continue; + + const char *name = lldb::SBLanguageRuntime::GetNameForLanguageType(lang); + if (!name) + continue; + std::string raw_lang_name = name; + std::string capitalized_lang_name = capitalize(name); + + if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnThrow(lang)) { + const char *raw_throw_keyword = + lldb::SBLanguageRuntime::GetThrowKeywordForLanguage(lang); + std::string throw_keyword = + raw_throw_keyword ? raw_throw_keyword : "throw"; + + exception_breakpoints.emplace_back( + *this, raw_lang_name + "_" + throw_keyword, + capitalized_lang_name + " " + capitalize(throw_keyword), lang, + eExceptionKindThrow); } - // Besides handling the hardcoded list of languages from above, we try to - // find any other languages that support exception breakpoints using the - // SB API. - for (int raw_lang = lldb::eLanguageTypeUnknown; - raw_lang < lldb::eNumLanguageTypes; ++raw_lang) { - lldb::LanguageType lang = static_cast(raw_lang); - - // We first discard any languages already handled above. - if (lldb::SBLanguageRuntime::LanguageIsCFamily(lang) || - lang == lldb::eLanguageTypeSwift) - continue; - - if (!lldb::SBDebugger::SupportsLanguage(lang)) - continue; - - const char *name = lldb::SBLanguageRuntime::GetNameForLanguageType(lang); - if (!name) - continue; - std::string raw_lang_name = name; - std::string capitalized_lang_name = capitalize(name); - - if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnThrow(lang)) { - const char *raw_throw_keyword = - lldb::SBLanguageRuntime::GetThrowKeywordForLanguage(lang); - std::string throw_keyword = - raw_throw_keyword ? 
raw_throw_keyword : "throw"; - - exception_breakpoints->emplace_back( - *this, raw_lang_name + "_" + throw_keyword, - capitalized_lang_name + " " + capitalize(throw_keyword), lang); - } - if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnCatch(lang)) { - const char *raw_catch_keyword = - lldb::SBLanguageRuntime::GetCatchKeywordForLanguage(lang); - std::string catch_keyword = - raw_catch_keyword ? raw_catch_keyword : "catch"; + if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnCatch(lang)) { + const char *raw_catch_keyword = + lldb::SBLanguageRuntime::GetCatchKeywordForLanguage(lang); + std::string catch_keyword = + raw_catch_keyword ? raw_catch_keyword : "catch"; - exception_breakpoints->emplace_back( - *this, raw_lang_name + "_" + catch_keyword, - capitalized_lang_name + " " + capitalize(catch_keyword), lang); - } + exception_breakpoints.emplace_back( + *this, raw_lang_name + "_" + catch_keyword, + capitalized_lang_name + " " + capitalize(catch_keyword), lang, + eExceptionKindCatch); } - assert(!exception_breakpoints->empty() && "should not be empty"); - }); + } } ExceptionBreakpoint *DAP::GetExceptionBreakpoint(llvm::StringRef filter) { - // PopulateExceptionBreakpoints() is called after g_dap.debugger is created - // in a request-initialize. - // - // But this GetExceptionBreakpoint() method may be called before attaching, in - // which case, we may not have populated the filter yet. - // - // We also cannot call PopulateExceptionBreakpoints() in DAP::DAP() because - // we need SBDebugger::Initialize() to have been called before this. - // - // So just calling PopulateExceptionBreakoints(),which does lazy-populating - // seems easiest. Two other options include: - // + call g_dap.PopulateExceptionBreakpoints() in lldb-dap.cpp::main() - // right after the call to SBDebugger::Initialize() - // + Just call PopulateExceptionBreakpoints() to get a fresh list everytime - // we query (a bit overkill since it's not likely to change?) 
- PopulateExceptionBreakpoints(); - - for (auto &bp : *exception_breakpoints) { + for (auto &bp : exception_breakpoints) { if (bp.GetFilter() == filter) return &bp; } @@ -223,10 +213,7 @@ ExceptionBreakpoint *DAP::GetExceptionBreakpoint(llvm::StringRef filter) { } ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const lldb::break_id_t bp_id) { - // See comment in the other GetExceptionBreakpoint(). - PopulateExceptionBreakpoints(); - - for (auto &bp : *exception_breakpoints) { + for (auto &bp : exception_breakpoints) { if (bp.GetID() == bp_id) return &bp; } @@ -1118,8 +1105,9 @@ protocol::Capabilities DAP::GetCapabilities() { } // Available filters or options for the setExceptionBreakpoints request. + PopulateExceptionBreakpoints(); std::vector filters; - for (const auto &exc_bp : *exception_breakpoints) + for (const auto &exc_bp : exception_breakpoints) filters.emplace_back(CreateExceptionBreakpointFilter(exc_bp)); capabilities.exceptionBreakpointFilters = std::move(filters); diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index 89bc827c1141f..5ca5822f9bced 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -99,7 +99,7 @@ struct DAP { lldb::SBBroadcaster broadcaster; FunctionBreakpointMap function_breakpoints; InstructionBreakpointMap instruction_breakpoints; - std::optional> exception_breakpoints; + std::vector exception_breakpoints; llvm::once_flag init_exception_breakpoints_flag; /// Map step in target id to list of function targets that user can choose. @@ -320,7 +320,7 @@ struct DAP { }); } - /// The set of capablities supported by this adapter. + /// The set of capabilities supported by this adapter. protocol::Capabilities GetCapabilities(); /// Debuggee will continue from stopped state. 
diff --git a/lldb/tools/lldb-dap/EventHelper.cpp b/lldb/tools/lldb-dap/EventHelper.cpp index 9641f29698b10..364cc7ab4ef8c 100644 --- a/lldb/tools/lldb-dap/EventHelper.cpp +++ b/lldb/tools/lldb-dap/EventHelper.cpp @@ -44,6 +44,11 @@ void SendTargetBasedCapabilities(DAP &dap) { protocol::CapabilitiesEventBody body; + const llvm::StringRef target_triple = dap.target.GetTriple(); + if (target_triple.starts_with("x86")) + body.capabilities.supportedFeatures.insert( + protocol::eAdapterFeatureStepInTargetsRequest); + // We only support restarting launch requests not attach requests. if (dap.last_launch_request) body.capabilities.supportedFeatures.insert( diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp index 9772e7344ced6..5bf06268a5af2 100644 --- a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp +++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp @@ -9,23 +9,33 @@ #include "ExceptionBreakpoint.h" #include "BreakpointBase.h" #include "DAP.h" +#include "Protocol/ProtocolTypes.h" #include "lldb/API/SBMutex.h" #include "lldb/API/SBTarget.h" #include +using namespace llvm; +using namespace lldb_dap::protocol; + namespace lldb_dap { -void ExceptionBreakpoint::SetBreakpoint() { +protocol::Breakpoint ExceptionBreakpoint::SetBreakpoint(StringRef condition) { lldb::SBMutex lock = m_dap.GetAPIMutex(); std::lock_guard guard(lock); - if (m_bp.IsValid()) - return; - bool catch_value = m_filter.find("_catch") != std::string::npos; - bool throw_value = m_filter.find("_throw") != std::string::npos; - m_bp = m_dap.target.BreakpointCreateForException(m_language, catch_value, - throw_value); - m_bp.AddName(BreakpointBase::kDAPBreakpointLabel); + if (!m_bp.IsValid()) { + m_bp = m_dap.target.BreakpointCreateForException( + m_language, m_kind == eExceptionKindCatch, + m_kind == eExceptionKindThrow); + m_bp.AddName(BreakpointBase::kDAPBreakpointLabel); + } + + m_bp.SetCondition(condition.data()); + + protocol::Breakpoint breakpoint; + 
breakpoint.id = m_bp.GetID(); + breakpoint.verified = m_bp.IsValid(); + return breakpoint; } void ExceptionBreakpoint::ClearBreakpoint() { diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.h b/lldb/tools/lldb-dap/ExceptionBreakpoint.h index 319b472a89a34..802ec71ce6ad3 100644 --- a/lldb/tools/lldb-dap/ExceptionBreakpoint.h +++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.h @@ -10,6 +10,7 @@ #define LLDB_TOOLS_LLDB_DAP_EXCEPTIONBREAKPOINT_H #include "DAPForward.h" +#include "Protocol/ProtocolTypes.h" #include "lldb/API/SBBreakpoint.h" #include "lldb/lldb-enumerations.h" #include "llvm/ADT/StringRef.h" @@ -18,14 +19,20 @@ namespace lldb_dap { +enum ExceptionKind : unsigned { + eExceptionKindCatch, + eExceptionKindThrow, +}; + class ExceptionBreakpoint { public: ExceptionBreakpoint(DAP &d, std::string f, std::string l, - lldb::LanguageType lang) + lldb::LanguageType lang, ExceptionKind kind) : m_dap(d), m_filter(std::move(f)), m_label(std::move(l)), - m_language(lang), m_bp() {} + m_language(lang), m_kind(kind), m_bp() {} - void SetBreakpoint(); + protocol::Breakpoint SetBreakpoint() { return SetBreakpoint(""); }; + protocol::Breakpoint SetBreakpoint(llvm::StringRef condition); void ClearBreakpoint(); lldb::break_id_t GetID() const { return m_bp.GetID(); } @@ -39,6 +46,7 @@ class ExceptionBreakpoint { std::string m_filter; std::string m_label; lldb::LanguageType m_language; + ExceptionKind m_kind; lldb::SBBreakpoint m_bp; }; diff --git a/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp index d5878d18289d6..85214b84b5c9c 100644 --- a/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp @@ -100,7 +100,7 @@ static DisassembledInstruction ConvertSBInstructionToDisassembledInstruction( const char *m = inst.GetMnemonic(target); const char *o = inst.GetOperands(target); - const char *c = inst.GetComment(target); + std::string c = 
inst.GetComment(target); auto d = inst.GetData(target); std::string bytes; @@ -114,34 +114,30 @@ static DisassembledInstruction ConvertSBInstructionToDisassembledInstruction( DisassembledInstruction disassembled_inst; disassembled_inst.address = inst_addr; - disassembled_inst.instructionBytes = - bytes.size() > 0 ? bytes.substr(0, bytes.size() - 1) : ""; - std::string instruction; - llvm::raw_string_ostream si(instruction); + if (!bytes.empty()) // remove last whitespace + bytes.pop_back(); + disassembled_inst.instructionBytes = std::move(bytes); + + llvm::raw_string_ostream si(disassembled_inst.instruction); + si << llvm::formatv("{0,-7} {1,-25}", m, o); - lldb::SBSymbol symbol = addr.GetSymbol(); // Only add the symbol on the first line of the function. - if (symbol.IsValid() && symbol.GetStartAddress() == addr) { - // If we have a valid symbol, append it as a label prefix for the first - // instruction. This is so you can see the start of a function/callsite - // in the assembly, at the moment VS Code (1.80) does not visualize the - // symbol associated with the assembly instruction. - si << (symbol.GetMangledName() != nullptr ? 
symbol.GetMangledName() - : symbol.GetName()) - << ": "; + // in the comment section + if (lldb::SBSymbol symbol = addr.GetSymbol(); + symbol.GetStartAddress() == addr) { + const llvm::StringRef sym_display_name = symbol.GetDisplayName(); + c.append(" "); + c.append(sym_display_name); if (resolve_symbols) - disassembled_inst.symbol = symbol.GetDisplayName(); + disassembled_inst.symbol = sym_display_name; } - si << llvm::formatv("{0,7} {1,12}", m, o); - if (c && c[0]) { + if (!c.empty()) { si << " ; " << c; } - disassembled_inst.instruction = std::move(instruction); - protocol::Source source = CreateSource(addr, target); lldb::SBLineEntry line_entry = GetLineEntryForAddress(target, addr); diff --git a/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp index dcd02d61ca4f4..b499a69876e2c 100644 --- a/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp @@ -54,7 +54,6 @@ llvm::Expected InitializeRequestHandler::Run( if (llvm::Error err = dap.RunPreInitCommands()) return err; - dap.PopulateExceptionBreakpoints(); auto cmd = dap.debugger.GetCommandInterpreter().AddMultiwordCommand( "lldb-dap", "Commands for managing lldb-dap."); if (arguments.supportedFeatures.contains( diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h index d3f231589b54c..054cc7a321316 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.h +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h @@ -353,14 +353,15 @@ class StepInRequestHandler : public RequestHandler> { public: - using LegacyRequestHandler::LegacyRequestHandler; + using RequestHandler::RequestHandler; static llvm::StringLiteral GetCommand() { return "stepInTargets"; } - FeatureSet GetSupportedFeatures() const override { - return {protocol::eAdapterFeatureStepInTargetsRequest}; - } - void operator()(const llvm::json::Object &request) const override; + 
llvm::Expected + Run(const protocol::StepInTargetsArguments &args) const override; }; class StepOutRequestHandler : public RequestHandler> { public: - using LegacyRequestHandler::LegacyRequestHandler; + using RequestHandler::RequestHandler; static llvm::StringLiteral GetCommand() { return "setExceptionBreakpoints"; } FeatureSet GetSupportedFeatures() const override { - return {protocol::eAdapterFeatureExceptionOptions}; + /// Prefer the `filterOptions` feature over the `exceptionOptions`. + /// exceptionOptions is not supported in VSCode, while `filterOptions` is + /// supported. + return {protocol::eAdapterFeatureExceptionFilterOptions}; } - void operator()(const llvm::json::Object &request) const override; + llvm::Expected + Run(const protocol::SetExceptionBreakpointsArguments &args) const override; }; class SetFunctionBreakpointsRequestHandler diff --git a/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp index 2214833f8a770..6a271fb825137 100644 --- a/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp @@ -8,86 +8,61 @@ #include "DAP.h" #include "EventHelper.h" -#include "JSONUtils.h" +#include "Protocol/ProtocolRequests.h" #include "RequestHandler.h" #include +using namespace llvm; +using namespace lldb_dap::protocol; + namespace lldb_dap { -// "SetExceptionBreakpointsRequest": { -// "allOf": [ { "$ref": "#/definitions/Request" }, { -// "type": "object", -// "description": "SetExceptionBreakpoints request; value of command field -// is 'setExceptionBreakpoints'. The request configures the debuggers -// response to thrown exceptions. 
If an exception is configured to break, a -// StoppedEvent is fired (event type 'exception').", "properties": { -// "command": { -// "type": "string", -// "enum": [ "setExceptionBreakpoints" ] -// }, -// "arguments": { -// "$ref": "#/definitions/SetExceptionBreakpointsArguments" -// } -// }, -// "required": [ "command", "arguments" ] -// }] -// }, -// "SetExceptionBreakpointsArguments": { -// "type": "object", -// "description": "Arguments for 'setExceptionBreakpoints' request.", -// "properties": { -// "filters": { -// "type": "array", -// "items": { -// "type": "string" -// }, -// "description": "IDs of checked exception options. The set of IDs is -// returned via the 'exceptionBreakpointFilters' capability." -// }, -// "exceptionOptions": { -// "type": "array", -// "items": { -// "$ref": "#/definitions/ExceptionOptions" -// }, -// "description": "Configuration options for selected exceptions." -// } -// }, -// "required": [ "filters" ] -// }, -// "SetExceptionBreakpointsResponse": { -// "allOf": [ { "$ref": "#/definitions/Response" }, { -// "type": "object", -// "description": "Response to 'setExceptionBreakpoints' request. This is -// just an acknowledgement, so no body field is required." -// }] -// } -void SetExceptionBreakpointsRequestHandler::operator()( - const llvm::json::Object &request) const { - llvm::json::Object response; - lldb::SBError error; - FillResponse(request, response); - const auto *arguments = request.getObject("arguments"); - const auto *filters = arguments->getArray("filters"); +/// The request configures the debugger’s response to thrown exceptions. Each of +/// the `filters`, `filterOptions`, and `exceptionOptions` in the request are +/// independent configurations to a debug adapter indicating a kind of exception +/// to catch. An exception thrown in a program should result in a `stopped` +/// event from the debug adapter (with reason `exception`) if any of the +/// configured filters match. 
+/// +/// Clients should only call this request if the corresponding capability +/// `exceptionBreakpointFilters` returns one or more filters. +Expected +SetExceptionBreakpointsRequestHandler::Run( + const SetExceptionBreakpointsArguments &arguments) const { // Keep a list of any exception breakpoint filter names that weren't set // so we can clear any exception breakpoints if needed. - std::set unset_filters; - for (const auto &bp : *dap.exception_breakpoints) + std::set unset_filters; + for (const auto &bp : dap.exception_breakpoints) unset_filters.insert(bp.GetFilter()); - for (const auto &value : *filters) { - const auto filter = GetAsString(value); + SetExceptionBreakpointsResponseBody body; + for (const auto &filter : arguments.filters) { auto *exc_bp = dap.GetExceptionBreakpoint(filter); - if (exc_bp) { - exc_bp->SetBreakpoint(); - unset_filters.erase(std::string(filter)); - } + if (!exc_bp) + continue; + + body.breakpoints.push_back(exc_bp->SetBreakpoint()); + unset_filters.erase(filter); + } + for (const auto &filterOptions : arguments.filterOptions) { + auto *exc_bp = dap.GetExceptionBreakpoint(filterOptions.filterId); + if (!exc_bp) + continue; + + body.breakpoints.push_back(exc_bp->SetBreakpoint(filterOptions.condition)); + unset_filters.erase(filterOptions.filterId); } + + // Clear any unset filters. 
for (const auto &filter : unset_filters) { auto *exc_bp = dap.GetExceptionBreakpoint(filter); - if (exc_bp) - exc_bp->ClearBreakpoint(); + if (!exc_bp) + continue; + + exc_bp->ClearBreakpoint(); } - dap.SendJSON(llvm::json::Value(std::move(response))); + + return body; } } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp index 9b99791599f82..9295b6ceae36d 100644 --- a/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp @@ -7,143 +7,85 @@ //===----------------------------------------------------------------------===// #include "DAP.h" -#include "EventHelper.h" -#include "JSONUtils.h" +#include "Protocol/ProtocolRequests.h" #include "RequestHandler.h" #include "lldb/API/SBInstruction.h" +#include "lldb/lldb-defines.h" +using namespace lldb_dap::protocol; namespace lldb_dap { -// "StepInTargetsRequest": { -// "allOf": [ { "$ref": "#/definitions/Request" }, { -// "type": "object", -// "description": "This request retrieves the possible step-in targets for -// the specified stack frame.\nThese targets can be used in the `stepIn` -// request.\nClients should only call this request if the corresponding -// capability `supportsStepInTargetsRequest` is true.", "properties": { -// "command": { -// "type": "string", -// "enum": [ "stepInTargets" ] -// }, -// "arguments": { -// "$ref": "#/definitions/StepInTargetsArguments" -// } -// }, -// "required": [ "command", "arguments" ] -// }] -// }, -// "StepInTargetsArguments": { -// "type": "object", -// "description": "Arguments for `stepInTargets` request.", -// "properties": { -// "frameId": { -// "type": "integer", -// "description": "The stack frame for which to retrieve the possible -// step-in targets." 
-// } -// }, -// "required": [ "frameId" ] -// }, -// "StepInTargetsResponse": { -// "allOf": [ { "$ref": "#/definitions/Response" }, { -// "type": "object", -// "description": "Response to `stepInTargets` request.", -// "properties": { -// "body": { -// "type": "object", -// "properties": { -// "targets": { -// "type": "array", -// "items": { -// "$ref": "#/definitions/StepInTarget" -// }, -// "description": "The possible step-in targets of the specified -// source location." -// } -// }, -// "required": [ "targets" ] -// } -// }, -// "required": [ "body" ] -// }] -// } -void StepInTargetsRequestHandler::operator()( - const llvm::json::Object &request) const { - llvm::json::Object response; - FillResponse(request, response); - const auto *arguments = request.getObject("arguments"); - +// This request retrieves the possible step-in targets for the specified stack +// frame. +// These targets can be used in the `stepIn` request. +// Clients should only call this request if the corresponding capability +// `supportsStepInTargetsRequest` is true. 
+llvm::Expected +StepInTargetsRequestHandler::Run(const StepInTargetsArguments &args) const { dap.step_in_targets.clear(); - lldb::SBFrame frame = dap.GetLLDBFrame(*arguments); - if (frame.IsValid()) { - lldb::SBAddress pc_addr = frame.GetPCAddress(); - lldb::SBAddress line_end_addr = - pc_addr.GetLineEntry().GetSameLineContiguousAddressRangeEnd(true); - lldb::SBInstructionList insts = dap.target.ReadInstructions( - pc_addr, line_end_addr, /*flavor_string=*/nullptr); - - if (!insts.IsValid()) { - response["success"] = false; - response["message"] = "Failed to get instructions for frame."; - dap.SendJSON(llvm::json::Value(std::move(response))); - return; - } + const lldb::SBFrame frame = dap.GetLLDBFrame(args.frameId); + if (!frame.IsValid()) + return llvm::make_error("Failed to get frame for input frameId."); + + lldb::SBAddress pc_addr = frame.GetPCAddress(); + lldb::SBAddress line_end_addr = + pc_addr.GetLineEntry().GetSameLineContiguousAddressRangeEnd(true); + lldb::SBInstructionList insts = dap.target.ReadInstructions( + pc_addr, line_end_addr, /*flavor_string=*/nullptr); + + if (!insts.IsValid()) + return llvm::make_error("Failed to get instructions for frame."); + + StepInTargetsResponseBody body; + const size_t num_insts = insts.GetSize(); + for (size_t i = 0; i < num_insts; ++i) { + lldb::SBInstruction inst = insts.GetInstructionAtIndex(i); + if (!inst.IsValid()) + break; + + const lldb::addr_t inst_addr = inst.GetAddress().GetLoadAddress(dap.target); + if (inst_addr == LLDB_INVALID_ADDRESS) + break; + + // Note: currently only x86/x64 supports flow kind. 
+ const lldb::InstructionControlFlowKind flow_kind = + inst.GetControlFlowKind(dap.target); + + if (flow_kind == lldb::eInstructionControlFlowKindCall) { + + const llvm::StringRef call_operand_name = inst.GetOperands(dap.target); + lldb::addr_t call_target_addr = LLDB_INVALID_ADDRESS; + if (call_operand_name.getAsInteger(0, call_target_addr)) + continue; + + const lldb::SBAddress call_target_load_addr = + dap.target.ResolveLoadAddress(call_target_addr); + if (!call_target_load_addr.IsValid()) + continue; + + // The existing ThreadPlanStepInRange only accept step in target + // function with debug info. + lldb::SBSymbolContext sc = dap.target.ResolveSymbolContextForAddress( + call_target_load_addr, lldb::eSymbolContextFunction); + + // The existing ThreadPlanStepInRange only accept step in target + // function with debug info. + llvm::StringRef step_in_target_name; + if (sc.IsValid() && sc.GetFunction().IsValid()) + step_in_target_name = sc.GetFunction().GetDisplayName(); + + // Skip call sites if we fail to resolve its symbol name. + if (step_in_target_name.empty()) + continue; - llvm::json::Array step_in_targets; - const auto num_insts = insts.GetSize(); - for (size_t i = 0; i < num_insts; ++i) { - lldb::SBInstruction inst = insts.GetInstructionAtIndex(i); - if (!inst.IsValid()) - break; - - lldb::addr_t inst_addr = inst.GetAddress().GetLoadAddress(dap.target); - - // Note: currently only x86/x64 supports flow kind. - lldb::InstructionControlFlowKind flow_kind = - inst.GetControlFlowKind(dap.target); - if (flow_kind == lldb::eInstructionControlFlowKindCall) { - // Use call site instruction address as id which is easy to debug. 
- llvm::json::Object step_in_target; - step_in_target["id"] = inst_addr; - - llvm::StringRef call_operand_name = inst.GetOperands(dap.target); - lldb::addr_t call_target_addr; - if (call_operand_name.getAsInteger(0, call_target_addr)) - continue; - - lldb::SBAddress call_target_load_addr = - dap.target.ResolveLoadAddress(call_target_addr); - if (!call_target_load_addr.IsValid()) - continue; - - // The existing ThreadPlanStepInRange only accept step in target - // function with debug info. - lldb::SBSymbolContext sc = dap.target.ResolveSymbolContextForAddress( - call_target_load_addr, lldb::eSymbolContextFunction); - - // The existing ThreadPlanStepInRange only accept step in target - // function with debug info. - std::string step_in_target_name; - if (sc.IsValid() && sc.GetFunction().IsValid()) - step_in_target_name = sc.GetFunction().GetDisplayName(); - - // Skip call sites if we fail to resolve its symbol name. - if (step_in_target_name.empty()) - continue; - - dap.step_in_targets.try_emplace(inst_addr, step_in_target_name); - step_in_target.try_emplace("label", step_in_target_name); - step_in_targets.emplace_back(std::move(step_in_target)); - } + StepInTarget target; + target.id = inst_addr; + target.label = step_in_target_name; + dap.step_in_targets.try_emplace(inst_addr, step_in_target_name); + body.targets.emplace_back(std::move(target)); } - llvm::json::Object body; - body.try_emplace("targets", std::move(step_in_targets)); - response.try_emplace("body", std::move(body)); - } else { - response["success"] = llvm::json::Value(false); - response["message"] = "Failed to get frame for input frameId."; } - dap.SendJSON(llvm::json::Value(std::move(response))); + return body; } } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 6cdde63e9796e..cf7db41559b8d 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -482,15 +482,6 @@ llvm::json::Object CreateEventObject(const 
llvm::StringRef event_name) { return event; } -protocol::ExceptionBreakpointsFilter -CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp) { - protocol::ExceptionBreakpointsFilter filter; - filter.filter = bp.GetFilter(); - filter.label = bp.GetLabel(); - filter.defaultState = ExceptionBreakpoint::kDefaultValue; - return filter; -} - // "StackFrame": { // "type": "object", // "description": "A Stackframe contains the source location.", diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 10dc46b94184f..69da0725bd05c 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -224,18 +224,6 @@ llvm::json::Value CreateModule(lldb::SBTarget &target, lldb::SBModule &module, /// definition outlined by Microsoft. llvm::json::Object CreateEventObject(const llvm::StringRef event_name); -/// Create a "ExceptionBreakpointsFilter" JSON object as described in -/// the debug adapter definition. -/// -/// \param[in] bp -/// The exception breakpoint object to use -/// -/// \return -/// A "ExceptionBreakpointsFilter" JSON object with that follows -/// the formal JSON definition outlined by Microsoft. -protocol::ExceptionBreakpointsFilter -CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp); - /// Create a "StackFrame" object for a LLDB frame object. 
/// /// This function will fill in the following keys in the returned diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index 2cb7c47d60203..e6ba54ed4dcd6 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -368,6 +368,16 @@ bool fromJSON(const json::Value &Params, StepInArguments &SIA, json::Path P) { OM.mapOptional("granularity", SIA.granularity); } +bool fromJSON(const llvm::json::Value &Params, StepInTargetsArguments &SITA, + llvm::json::Path P) { + json::ObjectMapper OM(Params, P); + return OM && OM.map("frameId", SITA.frameId); +} + +llvm::json::Value toJSON(const StepInTargetsResponseBody &SITR) { + return llvm::json::Object{{"targets", SITR.targets}}; +} + bool fromJSON(const json::Value &Params, StepOutArguments &SOA, json::Path P) { json::ObjectMapper OM(Params, P); return OM && OM.map("threadId", SOA.threadId) && @@ -438,6 +448,20 @@ json::Value toJSON(const SetDataBreakpointsResponseBody &SDBR) { return json::Object{{"breakpoints", SDBR.breakpoints}}; } +bool fromJSON(const json::Value &Params, SetExceptionBreakpointsArguments &Args, + json::Path P) { + json::ObjectMapper O(Params, P); + return O && O.map("filters", Args.filters) && + O.mapOptional("filterOptions", Args.filterOptions); +} + +json::Value toJSON(const SetExceptionBreakpointsResponseBody &B) { + json::Object result; + if (!B.breakpoints.empty()) + result.insert({"breakpoints", B.breakpoints}); + return result; +} + json::Value toJSON(const ThreadsResponseBody &TR) { return json::Object{{"threads", TR.threads}}; } diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index d199cc886b11c..01b8f2445c9fa 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -533,6 +533,21 @@ bool fromJSON(const llvm::json::Value &, StepInArguments &, 
llvm::json::Path); /// body field is required. using StepInResponse = VoidResponse; +/// Arguments for `stepInTargets` request. +struct StepInTargetsArguments { + /// The stack frame for which to retrieve the possible step-in targets. + uint64_t frameId = LLDB_INVALID_FRAME_ID; +}; +bool fromJSON(const llvm::json::Value &, StepInTargetsArguments &, + llvm::json::Path); + +/// Response to `stepInTargets` request. +struct StepInTargetsResponseBody { + /// The possible step-in targets of the specified source location. + std::vector targets; +}; +llvm::json::Value toJSON(const StepInTargetsResponseBody &); + /// Arguments for `stepOut` request. struct StepOutArguments { /// Specifies the thread for which to resume execution for one step-out (of @@ -736,6 +751,56 @@ struct SetDataBreakpointsResponseBody { }; llvm::json::Value toJSON(const SetDataBreakpointsResponseBody &); +/// Arguments for `setExceptionBreakpoints` request. +struct SetExceptionBreakpointsArguments { + /// Set of exception filters specified by their ID. The set of all possible + /// exception filters is defined by the `exceptionBreakpointFilters` + /// capability. The `filter` and `filterOptions` sets are additive. + std::vector filters; + + /// Set of exception filters and their options. The set of all possible + /// exception filters is defined by the `exceptionBreakpointFilters` + /// capability. This attribute is only honored by a debug adapter if the + /// corresponding capability `supportsExceptionFilterOptions` is true. The + /// `filter` and `filterOptions` sets are additive. + std::vector filterOptions; + + // unsupported keys: exceptionOptions +}; +bool fromJSON(const llvm::json::Value &, SetExceptionBreakpointsArguments &, + llvm::json::Path); + +/// Response to `setExceptionBreakpoints` request. +/// +/// The response contains an array of `Breakpoint` objects with information +/// about each exception breakpoint or filter. 
The `Breakpoint` objects are in +/// the same order as the elements of the `filters`, `filterOptions`, +/// `exceptionOptions` arrays given as arguments. If both `filters` and +/// `filterOptions` are given, the returned array must start with `filters` +/// information first, followed by `filterOptions` information. +/// +/// The `verified` property of a `Breakpoint` object signals whether the +/// exception breakpoint or filter could be successfully created and whether the +/// condition is valid. In case of an error the `message` property explains the +/// problem. The `id` property can be used to introduce a unique ID for the +/// exception breakpoint or filter so that it can be updated subsequently by +/// sending breakpoint events. +/// +/// For backward compatibility both the `breakpoints` array and the enclosing +/// `body` are optional. If these elements are missing a client is not able to +/// show problems for individual exception breakpoints or filters. +struct SetExceptionBreakpointsResponseBody { + /// Information about the exception breakpoints or filters. + /// + /// The breakpoints returned are in the same order as the elements of the + /// `filters`, `filterOptions`, `exceptionOptions` arrays in the arguments. If + /// both `filters` and `filterOptions` are given, the returned array must + /// start with `filters` information first, followed by `filterOptions` + /// information. + std::vector breakpoints; +}; +llvm::json::Value toJSON(const SetExceptionBreakpointsResponseBody &); + /// Arguments to `disassemble` request. 
struct DisassembleArguments { /// Memory reference to the base location containing the instructions to diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp index 085d53bb006ef..7f96c07faae10 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp @@ -86,14 +86,14 @@ bool fromJSON(const llvm::json::Value &Params, ExceptionBreakpointsFilter &EBF, json::Value toJSON(const ExceptionBreakpointsFilter &EBF) { json::Object result{{"filter", EBF.filter}, {"label", EBF.label}}; - if (EBF.description) - result.insert({"description", *EBF.description}); + if (!EBF.description.empty()) + result.insert({"description", EBF.description}); if (EBF.defaultState) - result.insert({"default", *EBF.defaultState}); + result.insert({"default", EBF.defaultState}); if (EBF.supportsCondition) - result.insert({"supportsCondition", *EBF.supportsCondition}); - if (EBF.conditionDescription) - result.insert({"conditionDescription", *EBF.conditionDescription}); + result.insert({"supportsCondition", EBF.supportsCondition}); + if (!EBF.conditionDescription.empty()) + result.insert({"conditionDescription", EBF.conditionDescription}); return result; } @@ -418,23 +418,41 @@ json::Value toJSON(const Capabilities &C) { for (const auto &feature : C.supportedFeatures) result.insert({ToString(feature), true}); - if (C.exceptionBreakpointFilters && !C.exceptionBreakpointFilters->empty()) + if (!C.exceptionBreakpointFilters.empty()) + result.insert({"exceptionBreakpointFilters", C.exceptionBreakpointFilters}); + if (!C.completionTriggerCharacters.empty()) result.insert( - {"exceptionBreakpointFilters", *C.exceptionBreakpointFilters}); - if (C.completionTriggerCharacters && !C.completionTriggerCharacters->empty()) + {"completionTriggerCharacters", C.completionTriggerCharacters}); + if (!C.additionalModuleColumns.empty()) + result.insert({"additionalModuleColumns", C.additionalModuleColumns}); + if 
(!C.supportedChecksumAlgorithms.empty()) result.insert( - {"completionTriggerCharacters", *C.completionTriggerCharacters}); - if (C.additionalModuleColumns && !C.additionalModuleColumns->empty()) - result.insert({"additionalModuleColumns", *C.additionalModuleColumns}); - if (C.supportedChecksumAlgorithms && !C.supportedChecksumAlgorithms->empty()) - result.insert( - {"supportedChecksumAlgorithms", *C.supportedChecksumAlgorithms}); - if (C.breakpointModes && !C.breakpointModes->empty()) - result.insert({"breakpointModes", *C.breakpointModes}); + {"supportedChecksumAlgorithms", C.supportedChecksumAlgorithms}); + if (!C.breakpointModes.empty()) + result.insert({"breakpointModes", C.breakpointModes}); // lldb-dap extensions - if (C.lldbExtVersion && !C.lldbExtVersion->empty()) - result.insert({"$__lldb_version", *C.lldbExtVersion}); + if (!C.lldbExtVersion.empty()) + result.insert({"$__lldb_version", C.lldbExtVersion}); + + return result; +} + +bool fromJSON(const json::Value &Params, ExceptionFilterOptions &EFO, + json::Path P) { + json::ObjectMapper O(Params, P); + return O && O.map("filterId", EFO.filterId) && + O.mapOptional("condition", EFO.condition) && + O.mapOptional("mode", EFO.mode); +} + +json::Value toJSON(const ExceptionFilterOptions &EFO) { + json::Object result{{"filterId", EFO.filterId}}; + + if (!EFO.condition.empty()) + result.insert({"condition", EFO.condition}); + if (!EFO.mode.empty()) + result.insert({"mode", EFO.mode}); return result; } @@ -582,6 +600,30 @@ llvm::json::Value toJSON(const SteppingGranularity &SG) { llvm_unreachable("unhandled stepping granularity."); } +bool fromJSON(const json::Value &Params, StepInTarget &SIT, json::Path P) { + json::ObjectMapper O(Params, P); + return O && O.map("id", SIT.id) && O.map("label", SIT.label) && + O.mapOptional("line", SIT.line) && + O.mapOptional("column", SIT.column) && + O.mapOptional("endLine", SIT.endLine) && + O.mapOptional("endColumn", SIT.endColumn); +} + +llvm::json::Value toJSON(const 
StepInTarget &SIT) {
+  json::Object target{{"id", SIT.id}, {"label", SIT.label}};
+
+  if (SIT.line != LLDB_INVALID_LINE_NUMBER)
+    target.insert({"line", SIT.line});
+  if (SIT.column != LLDB_INVALID_COLUMN_NUMBER)
+    target.insert({"column", SIT.column});
+  if (SIT.endLine != LLDB_INVALID_LINE_NUMBER)
+    target.insert({"endLine", SIT.endLine});
+  if (SIT.endColumn != LLDB_INVALID_COLUMN_NUMBER)
+    target.insert({"endColumn", SIT.endColumn});
+
+  return target;
+}
+
 bool fromJSON(const json::Value &Params, Thread &T, json::Path P) {
   json::ObjectMapper O(Params, P);
   return O && O.map("id", T.id) && O.map("name", T.name);
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
index c7acfc482987b..7fe7454113994 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
@@ -43,19 +43,19 @@ struct ExceptionBreakpointsFilter {
 
   /// A help text providing additional information about the exception filter.
   /// This string is typically shown as a hover and can be translated.
-  std::optional description;
+  std::string description;
 
   /// Initial value of the filter option. If not specified a value false is
   /// assumed.
-  std::optional defaultState;
+  bool defaultState = false;
 
   /// Controls whether a condition can be specified for this filter option. If
   /// false or missing, a condition can not be set.
-  std::optional supportsCondition;
+  bool supportsCondition = false;
 
   /// A help text providing information about the condition. This string is
   /// shown as the placeholder text for a text box and can be translated.
-  std::optional conditionDescription;
+  std::string conditionDescription;
 };
 bool fromJSON(const llvm::json::Value &, ExceptionBreakpointsFilter &,
               llvm::json::Path);
@@ -253,18 +253,17 @@ struct Capabilities {
 
   /// Available exception filter options for the `setExceptionBreakpoints`
   /// request.
- std::optional> - exceptionBreakpointFilters; + std::vector exceptionBreakpointFilters; /// The set of characters that should trigger completion in a REPL. If not /// specified, the UI should assume the `.` character. - std::optional> completionTriggerCharacters; + std::vector completionTriggerCharacters; /// The set of additional module information exposed by the debug adapter. - std::optional> additionalModuleColumns; + std::vector additionalModuleColumns; /// Checksum algorithms supported by the debug adapter. - std::optional> supportedChecksumAlgorithms; + std::vector supportedChecksumAlgorithms; /// Modes of breakpoints supported by the debug adapter, such as 'hardware' or /// 'software'. If present, the client may allow the user to select a mode and @@ -272,19 +271,39 @@ struct Capabilities { /// /// Clients may present the first applicable mode in this array as the /// 'default' mode in gestures that set breakpoints. - std::optional> breakpointModes; + std::vector breakpointModes; /// lldb-dap Extensions /// @{ /// The version of the adapter. - std::optional lldbExtVersion; + std::string lldbExtVersion; /// @} }; bool fromJSON(const llvm::json::Value &, Capabilities &, llvm::json::Path); llvm::json::Value toJSON(const Capabilities &); +/// An `ExceptionFilterOptions` is used to specify an exception filter together +/// with a condition for the `setExceptionBreakpoints` request. +struct ExceptionFilterOptions { + /// ID of an exception filter returned by the `exceptionBreakpointFilters` + /// capability. + std::string filterId; + + /// An expression for conditional exceptions. + /// The exception breaks into the debugger if the result of the condition is + /// true. + std::string condition; + + /// The mode of this exception breakpoint. If defined, this must be one of the + /// `breakpointModes` the debug adapter advertised in its `Capabilities`. 
+ std::string mode; +}; +bool fromJSON(const llvm::json::Value &, ExceptionFilterOptions &, + llvm::json::Path); +llvm::json::Value toJSON(const ExceptionFilterOptions &); + /// A `Source` is a descriptor for source code. It is returned from the debug /// adapter as part of a `StackFrame` and it is used by clients when specifying /// breakpoints. @@ -414,6 +433,34 @@ bool fromJSON(const llvm::json::Value &, SteppingGranularity &, llvm::json::Path); llvm::json::Value toJSON(const SteppingGranularity &); +/// A `StepInTarget` can be used in the `stepIn` request and determines into +/// which single target the `stepIn` request should step. +struct StepInTarget { + /// Unique identifier for a step-in target. + lldb::addr_t id = LLDB_INVALID_ADDRESS; + + /// The name of the step-in target (shown in the UI). + std::string label; + + /// The line of the step-in target. + uint32_t line = LLDB_INVALID_LINE_NUMBER; + + /// Start position of the range covered by the step in target. It is measured + /// in UTF-16 code units and the client capability `columnsStartAt1` + /// determines whether it is 0- or 1-based. + uint32_t column = LLDB_INVALID_COLUMN_NUMBER; + + /// The end line of the range covered by the step-in target. + uint32_t endLine = LLDB_INVALID_LINE_NUMBER; + + /// End position of the range covered by the step in target. It is measured in + /// UTF-16 code units and the client capability `columnsStartAt1` determines + /// whether it is 0- or 1-based. + uint32_t endColumn = LLDB_INVALID_COLUMN_NUMBER; +}; +bool fromJSON(const llvm::json::Value &, StepInTarget &, llvm::json::Path); +llvm::json::Value toJSON(const StepInTarget &); + /// A Thread. struct Thread { /// Unique identifier for the thread. 
diff --git a/lldb/tools/lldb-dap/ProtocolUtils.cpp b/lldb/tools/lldb-dap/ProtocolUtils.cpp index 6e0adf5bc8b59..cb1ee6a424003 100644 --- a/lldb/tools/lldb-dap/ProtocolUtils.cpp +++ b/lldb/tools/lldb-dap/ProtocolUtils.cpp @@ -161,4 +161,15 @@ std::vector GetThreads(lldb::SBProcess process, return threads; } +protocol::ExceptionBreakpointsFilter +CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp) { + protocol::ExceptionBreakpointsFilter filter; + filter.filter = bp.GetFilter(); + filter.label = bp.GetLabel(); + filter.description = bp.GetLabel(); + filter.defaultState = ExceptionBreakpoint::kDefaultValue; + filter.supportsCondition = true; + return filter; +} + } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/ProtocolUtils.h b/lldb/tools/lldb-dap/ProtocolUtils.h index 2b2ac9e8e35fd..788d2fd054e2d 100644 --- a/lldb/tools/lldb-dap/ProtocolUtils.h +++ b/lldb/tools/lldb-dap/ProtocolUtils.h @@ -13,6 +13,7 @@ #ifndef LLDB_TOOLS_LLDB_DAP_PROTOCOL_PROTOCOL_UTILS_H #define LLDB_TOOLS_LLDB_DAP_PROTOCOL_PROTOCOL_UTILS_H +#include "ExceptionBreakpoint.h" #include "Protocol/ProtocolTypes.h" #include "lldb/API/SBAddress.h" @@ -74,6 +75,18 @@ protocol::Thread CreateThread(lldb::SBThread &thread, lldb::SBFormat &format); std::vector GetThreads(lldb::SBProcess process, lldb::SBFormat &format); +/// Create a "ExceptionBreakpointsFilter" JSON object as described in +/// the debug adapter definition. +/// +/// \param[in] bp +/// The exception breakpoint object to use +/// +/// \return +/// A "ExceptionBreakpointsFilter" JSON object with that follows +/// the formal JSON definition outlined by Microsoft. 
+protocol::ExceptionBreakpointsFilter +CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp); + } // namespace lldb_dap #endif diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json index 0f51c4f935e33..b150dee792c34 100644 --- a/lldb/tools/lldb-dap/package.json +++ b/lldb/tools/lldb-dap/package.json @@ -290,7 +290,7 @@ "language": "d" }, { - "language": "fortan" + "language": "fortran" }, { "language": "fortran-modern" @@ -318,6 +318,22 @@ { "type": "lldb-dap", "label": "LLDB DAP Debugger", + "languages": [ + "ada", + "arm", + "c", + "cpp", + "crystal", + "d", + "fortran", + "fortran-modern", + "nim", + "objective-c", + "objectpascal", + "pascal", + "rust", + "swift" + ], "configurationAttributes": { "launch": { "required": [ diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp index adf43c9ac2046..46a09f090fea2 100644 --- a/lldb/unittests/DAP/ProtocolTypesTest.cpp +++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp @@ -243,14 +243,12 @@ TEST(ProtocolTypesTest, Capabilities) { deserialized_capabilities->supportedFeatures); // Verify exception breakpoint filters. 
- ASSERT_TRUE( - deserialized_capabilities->exceptionBreakpointFilters.has_value()); - EXPECT_EQ(capabilities.exceptionBreakpointFilters->size(), - deserialized_capabilities->exceptionBreakpointFilters->size()); - for (size_t i = 0; i < capabilities.exceptionBreakpointFilters->size(); ++i) { - const auto &original = capabilities.exceptionBreakpointFilters->at(i); + EXPECT_EQ(capabilities.exceptionBreakpointFilters.size(), + deserialized_capabilities->exceptionBreakpointFilters.size()); + for (size_t i = 0; i < capabilities.exceptionBreakpointFilters.size(); ++i) { + const auto &original = capabilities.exceptionBreakpointFilters.at(i); const auto &deserialized = - deserialized_capabilities->exceptionBreakpointFilters->at(i); + deserialized_capabilities->exceptionBreakpointFilters.at(i); EXPECT_EQ(original.filter, deserialized.filter); EXPECT_EQ(original.label, deserialized.label); EXPECT_EQ(original.description, deserialized.description); @@ -260,19 +258,16 @@ TEST(ProtocolTypesTest, Capabilities) { } // Verify completion trigger characters. - ASSERT_TRUE( - deserialized_capabilities->completionTriggerCharacters.has_value()); EXPECT_EQ(capabilities.completionTriggerCharacters, deserialized_capabilities->completionTriggerCharacters); // Verify additional module columns. 
- ASSERT_TRUE(deserialized_capabilities->additionalModuleColumns.has_value()); - EXPECT_EQ(capabilities.additionalModuleColumns->size(), - deserialized_capabilities->additionalModuleColumns->size()); - for (size_t i = 0; i < capabilities.additionalModuleColumns->size(); ++i) { - const auto &original = capabilities.additionalModuleColumns->at(i); + EXPECT_EQ(capabilities.additionalModuleColumns.size(), + deserialized_capabilities->additionalModuleColumns.size()); + for (size_t i = 0; i < capabilities.additionalModuleColumns.size(); ++i) { + const auto &original = capabilities.additionalModuleColumns.at(i); const auto &deserialized = - deserialized_capabilities->additionalModuleColumns->at(i); + deserialized_capabilities->additionalModuleColumns.at(i); EXPECT_EQ(original.attributeName, deserialized.attributeName); EXPECT_EQ(original.label, deserialized.label); EXPECT_EQ(original.format, deserialized.format); @@ -281,19 +276,15 @@ TEST(ProtocolTypesTest, Capabilities) { } // Verify supported checksum algorithms. - ASSERT_TRUE( - deserialized_capabilities->supportedChecksumAlgorithms.has_value()); EXPECT_EQ(capabilities.supportedChecksumAlgorithms, deserialized_capabilities->supportedChecksumAlgorithms); // Verify breakpoint modes. 
- ASSERT_TRUE(deserialized_capabilities->breakpointModes.has_value()); - EXPECT_EQ(capabilities.breakpointModes->size(), - deserialized_capabilities->breakpointModes->size()); - for (size_t i = 0; i < capabilities.breakpointModes->size(); ++i) { - const auto &original = capabilities.breakpointModes->at(i); - const auto &deserialized = - deserialized_capabilities->breakpointModes->at(i); + EXPECT_EQ(capabilities.breakpointModes.size(), + deserialized_capabilities->breakpointModes.size()); + for (size_t i = 0; i < capabilities.breakpointModes.size(); ++i) { + const auto &original = capabilities.breakpointModes.at(i); + const auto &deserialized = deserialized_capabilities->breakpointModes.at(i); EXPECT_EQ(original.mode, deserialized.mode); EXPECT_EQ(original.label, deserialized.label); EXPECT_EQ(original.description, deserialized.description); @@ -301,7 +292,6 @@ TEST(ProtocolTypesTest, Capabilities) { } // Verify lldb extension version. - ASSERT_TRUE(deserialized_capabilities->lldbExtVersion.has_value()); EXPECT_EQ(capabilities.lldbExtVersion, deserialized_capabilities->lldbExtVersion); } @@ -686,3 +676,92 @@ TEST(ProtocolTypesTest, CapabilitiesEventBody) { // Validate toJSON EXPECT_EQ(json, pp(body)); } + +TEST(ProtocolTypesTest, ExceptionFilterOptions) { + EXPECT_THAT_EXPECTED(parse(R"({"filterId":"id"})"), + HasValue(Value(ExceptionFilterOptions{ + /*filterId=*/"id", /*condition=*/"", /*mode*/ ""}))); + EXPECT_THAT_EXPECTED( + parse(R"({"filterId":"id","condition":"1+2"})"), + HasValue(Value(ExceptionFilterOptions{ + /*filterId=*/"id", /*condition=*/"1+2", /*mode*/ ""}))); + EXPECT_THAT_EXPECTED( + parse( + R"({"filterId":"id","condition":"1+2","mode":"m"})"), + HasValue(Value(ExceptionFilterOptions{ + /*filterId=*/"id", /*condition=*/"1+2", /*mode*/ "m"}))); + + // Validate parsing errors + EXPECT_THAT_EXPECTED( + parse(R"({})", "exceptionFilterOptions"), + FailedWithMessage("missing value at exceptionFilterOptions.filterId")); + EXPECT_THAT_EXPECTED( + 
parse(R"({"filterId":"id","condition":42})", + "exceptionFilterOptions"), + FailedWithMessage("expected string at exceptionFilterOptions.condition")); + EXPECT_THAT_EXPECTED( + parse(R"({"filterId":"id","mode":42})", + "exceptionFilterOptions"), + FailedWithMessage("expected string at exceptionFilterOptions.mode")); +} + +TEST(ProtocolTypesTest, SetExceptionBreakpointsArguments) { + EXPECT_THAT_EXPECTED( + parse(R"({"filters":[]})"), + HasValue(testing::FieldsAre(/*filters=*/testing::IsEmpty(), + /*filterOptions=*/testing::IsEmpty()))); + EXPECT_THAT_EXPECTED( + parse(R"({"filters":["abc"]})"), + HasValue(testing::FieldsAre(/*filters=*/std::vector{"abc"}, + /*filterOptions=*/testing::IsEmpty()))); + EXPECT_THAT_EXPECTED( + parse( + R"({"filters":[],"filterOptions":[{"filterId":"abc"}]})"), + HasValue(testing::FieldsAre( + /*filters=*/testing::IsEmpty(), + /*filterOptions=*/testing::Contains(testing::FieldsAre( + /*filterId=*/"abc", /*condition=*/"", /*mode=*/""))))); + + // Validate parse errors + EXPECT_THAT_EXPECTED(parse(R"({})"), + FailedWithMessage("missing value at (root).filters")); + EXPECT_THAT_EXPECTED( + parse(R"({"filters":false})"), + FailedWithMessage("expected array at (root).filters")); +} + +TEST(ProtocolTypesTest, SetExceptionBreakpointsResponseBody) { + SetExceptionBreakpointsResponseBody body; + Breakpoint bp; + bp.id = 12, bp.verified = true; + body.breakpoints = {bp}; + EXPECT_EQ(R"({ + "breakpoints": [ + { + "id": 12, + "verified": true + } + ] +})", + pp(body)); +} + +TEST(ProtocolTypesTest, StepInTarget) { + StepInTarget target; + target.id = 230; + target.label = "the_function_name"; + target.line = 2; + target.column = 320; + target.endLine = 32; + target.endColumn = 23; + + llvm::Expected deserialized_target = roundtrip(target); + ASSERT_THAT_EXPECTED(deserialized_target, llvm::Succeeded()); + + EXPECT_EQ(target.id, deserialized_target->id); + EXPECT_EQ(target.label, deserialized_target->label); + EXPECT_EQ(target.line, 
deserialized_target->line); + EXPECT_EQ(target.column, deserialized_target->column); + EXPECT_EQ(target.endLine, deserialized_target->endLine); + EXPECT_EQ(target.endColumn, deserialized_target->endColumn); +} diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 9895469973e47..0fcd73e752311 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -19,6 +19,7 @@ if (ANDROID OR CYGWIN OR CMAKE_SYSTEM_NAME MATCHES "AIX|DragonFly|FreeBSD|Haiku| set(HAVE_SYS_MMAN_H 1) set(HAVE_SYSEXITS_H 1) set(HAVE_UNISTD_H 1) + set(HAVE_SYS_IOCTL_H 1) elseif (APPLE) set(HAVE_MACH_MACH_H 1) set(HAVE_MALLOC_MALLOC_H 1) @@ -26,6 +27,7 @@ elseif (APPLE) set(HAVE_SYS_MMAN_H 1) set(HAVE_SYSEXITS_H 1) set(HAVE_UNISTD_H 1) + set(HAVE_SYS_IOCTL_H 1) elseif (WIN32) set(HAVE_MACH_MACH_H 0) set(HAVE_MALLOC_MALLOC_H 0) @@ -33,6 +35,7 @@ elseif (WIN32) set(HAVE_SYS_MMAN_H 0) set(HAVE_SYSEXITS_H 0) set(HAVE_UNISTD_H 0) + set(HAVE_SYS_IOCTL_H 0) elseif (ZOS) # Confirmed in # https://github.com/llvm/llvm-project/pull/104706#issuecomment-2297109613 @@ -42,6 +45,7 @@ elseif (ZOS) set(HAVE_SYS_MMAN_H 1) set(HAVE_SYSEXITS_H 0) set(HAVE_UNISTD_H 1) + set(HAVE_SYS_IOCTL_H 1) else() # Other platforms that we don't promise support for. 
check_include_file(mach/mach.h HAVE_MACH_MACH_H) @@ -50,6 +54,7 @@ else() check_include_file(sys/mman.h HAVE_SYS_MMAN_H) check_include_file(sysexits.h HAVE_SYSEXITS_H) check_include_file(unistd.h HAVE_UNISTD_H) + check_include_file(sys/ioctl.h HAVE_SYS_IOCTL_H) endif() if( UNIX AND NOT (APPLE OR BEOS OR HAIKU) ) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 7a7340ff8a456..8d8a94d1cddc4 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -2192,7 +2192,7 @@ endfunction() function(add_lit_testsuites project directory) if (NOT LLVM_ENABLE_IDE) - cmake_parse_arguments(ARG "EXCLUDE_FROM_CHECK_ALL" "FOLDER" "PARAMS;DEPENDS;ARGS" ${ARGN}) + cmake_parse_arguments(ARG "EXCLUDE_FROM_CHECK_ALL" "FOLDER;BINARY_DIR" "PARAMS;DEPENDS;ARGS" ${ARGN}) if (NOT ARG_FOLDER) get_subproject_title(subproject_title) @@ -2213,13 +2213,18 @@ function(add_lit_testsuites project directory) endif() # Create a check- target for the directory. 
- string(REPLACE ${directory} "" name_slash ${lit_suite}) + string(REPLACE "${directory}/" "" name_slash ${lit_suite}) if (name_slash) + set(filter ${name_slash}) string(REPLACE "/" "-" name_slash ${name_slash}) string(REPLACE "\\" "-" name_dashes ${name_slash}) - string(TOLOWER "${project}${name_dashes}" name_var) + string(TOLOWER "${project}-${name_dashes}" name_var) + set(lit_args ${lit_suite}) + if (ARG_BINARY_DIR) + set(lit_args ${ARG_BINARY_DIR} --filter=${filter}) + endif() add_lit_target("check-${name_var}" "Running lit suite ${lit_suite}" - ${lit_suite} + ${lit_args} ${EXCLUDE_FROM_CHECK_ALL} PARAMS ${ARG_PARAMS} DEPENDS ${ARG_DEPENDS} diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 743eb6f5529f2..8004d3571fc8a 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -199,17 +199,17 @@ endif() string(TOUPPER "${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}" uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING) if( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE" ) - set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 ) + set( LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 1 ) elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE_AND_ORIGIN" ) - set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 ) - set( LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 1 ) + set( LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 1 ) + set( LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN 1 ) elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "DISABLED" OR NOT DEFINED LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING ) # The DISABLED setting is default. 
- set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 0 ) + set( LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 0 ) else() message(FATAL_ERROR "Unknown value for LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING: \"${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}\"!") endif() -# LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING (non-cached) is expected to be +# LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE (non-cached) is expected to be # 1 or 0 here, assuming referenced in #cmakedefine01. if(LLVM_EXPERIMENTAL_KEY_INSTRUCTIONS) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 39f04f8e01b85..c052b076c21c3 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -2709,7 +2709,8 @@ The following relocation types are supported: the ``mesa3d`` OS, which does not support ``R_AMDGPU_ABS64``. There is no current OS loader support for 32-bit programs and so -``R_AMDGPU_ABS32`` is not used. +``R_AMDGPU_ABS32`` is only generated for static relocations, for example to +implement some DWARF32 forms. .. _amdgpu-loaded-code-object-path-uniform-resource-identifier: diff --git a/llvm/docs/InstCombineContributorGuide.md b/llvm/docs/InstCombineContributorGuide.md index b4041f8a5b93f..cee0a7ce446a6 100644 --- a/llvm/docs/InstCombineContributorGuide.md +++ b/llvm/docs/InstCombineContributorGuide.md @@ -404,11 +404,32 @@ The use of TargetTransformInfo is only allowed for hooks for target-specific intrinsics, such as `TargetTransformInfo::instCombineIntrinsic()`. These are already inherently target-dependent anyway. +If some canonicalization narrow/widen the integer width of expressions, please +check `shouldChangeType()` first. Otherwise, we may evaluate the expression +in illegal/inefficient types. + For vector-specific transforms that require cost-modelling, the VectorCombine pass can be used instead. In very rare circumstances, if there are no other alternatives, target-dependent transforms may be accepted into AggressiveInstCombine. 
+Generally, we prefer unsigned operations over signed operations in the middle-end, even
+if signed operations are more efficient on some targets. The following is an incomplete
+list of canonicalizations that are implemented in InstCombine:
+
+| Original Pattern | Canonical Form | Condition |
+|------------------------------|----------------------------|-------------------------------|
+| `icmp spred X, Y` | `icmp samesign upred X, Y` | `sign(X) == sign(Y)` |
+| `smin/smax X, Y` | `umin/umax X, Y` | `sign(X) == sign(Y)` |
+| `sext X` | `zext nneg X` | `X >=s 0` |
+| `sitofp X` | `uitofp nneg X` | `X >=s 0` |
+| `ashr X, Y` | `lshr X, Y` | `X >=s 0` |
+| `sdiv/srem X, Y` | `udiv/urem X, Y` | `X >=s 0 && Y >=s 0` |
+| `add X, Y` | `or disjoint X, Y` | `(X & Y) == 0` |
+| `mul X, C` | `shl X, Log2(C)` | `isPowerOf2(C)` |
+| `select Cond1, Cond2, false` | `and Cond1, Cond2` | `impliesPoison(Cond2, Cond1)` |
+| `select Cond1, true, Cond2` | `or Cond1, Cond2` | `impliesPoison(Cond2, Cond1)` |
+
 ### PatternMatch
 
 Many transforms make use of the matching infrastructure defined in
@@ -531,6 +552,19 @@ need to add a one-use check for the inner instruction.
 
 One-use checks can be performed using the `m_OneUse()` matcher, or the
 `V->hasOneUse()` method.
 
+### Flag handling
+
+When possible, favour propagation of poison-generating flags like `nuw` and `nsw` since they may be
+hard to salvage later. Avoid doing so if it introduces additional complexity (e.g. requires querying `willNotOverflow`
+or KnownBits).
+
+Be careful with in-place operand/predicate changes, as poison-generating flags may not be valid for new
+operands. It is recommended to create a new instruction with careful handling of flags. If not
+applicable, call `Instruction::dropPoisonGeneratingFlags()` to clear flags in a conservative manner.
+
+Do not rely on fcmp's `nsz` flag to perform optimizations. It is meaningless for fcmp so it should not affect
+ ### Generalization Transforms can both be too specific (only handling some odd subset of patterns, @@ -558,6 +592,11 @@ guidelines. use of ValueTracking queries. Whether this makes sense depends on the case, but it's usually a good idea to only handle the constant pattern first, and then generalize later if it seems useful. + * When possible, handle more canonical patterns as well. It is encouraged to avoid + potential phase-ordering issues. For example, if the motivating transform holds for + `add`, it also holds for `or disjoint`. See the canonicalization list above for details. + In most cases, it can be easily implemented with matchers like + `m_AddLike/m_SExtLike/m_LogicalAnd/m_LogicalOr`. ## Guidelines for reviewers diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 64f17f59575ea..81684ba30f12c 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -442,61 +442,61 @@ The current vendor extensions supported are: LLVM implements `the custom compressed opcodes present in some QingKe cores` by WCH / Nanjing Qinheng Microelectronics. The vendor refers to these opcodes by the name "XW". ``experimental-Xqccmp`` - LLVM implements `version 0.1 of the 16-bit Push/Pop instructions and double-moves extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. + LLVM implements `version 0.3 of the 16-bit Push/Pop instructions and double-moves extension specification `__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification. ``experimental-Xqcia`` - LLVM implements `version 0.7 of the Qualcomm uC Arithmetic extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.7 of the Qualcomm uC Arithmetic extension specification `__ by Qualcomm. These instructions are only available for riscv32. 
``experimental-Xqciac`` - LLVM implements `version 0.3 of the Qualcomm uC Load-Store Address Calculation extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.3 of the Qualcomm uC Load-Store Address Calculation extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcibi`` - LLVM implements `version 0.2 of the Qualcomm uC Branch Immediate extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.2 of the Qualcomm uC Branch Immediate extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcibm`` - LLVM implements `version 0.8 of the Qualcomm uC Bit Manipulation extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.8 of the Qualcomm uC Bit Manipulation extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcicli`` - LLVM implements `version 0.3 of the Qualcomm uC Conditional Load Immediate extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.3 of the Qualcomm uC Conditional Load Immediate extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcicm`` - LLVM implements `version 0.2 of the Qualcomm uC Conditional Move extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.2 of the Qualcomm uC Conditional Move extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcics`` - LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification `__ by Qualcomm. These instructions are only available for riscv32. 
+ LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcicsr`` - LLVM implements `version 0.3 of the Qualcomm uC CSR extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.4 of the Qualcomm uC CSR extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqciint`` - LLVM implements `version 0.7 of the Qualcomm uC Interrupts extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.10 of the Qualcomm uC Interrupts extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqciio`` - LLVM implements `version 0.1 of the Qualcomm uC External Input Output extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.1 of the Qualcomm uC External Input Output extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcilb`` - LLVM implements `version 0.2 of the Qualcomm uC Long Branch extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.2 of the Qualcomm uC Long Branch extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcili`` - LLVM implements `version 0.2 of the Qualcomm uC Load Large Immediate extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.2 of the Qualcomm uC Load Large Immediate extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcilia`` - LLVM implements `version 0.2 of the Qualcomm uC Large Immediate Arithmetic extension specification `__ by Qualcomm. 
These instructions are only available for riscv32. + LLVM implements `version 0.2 of the Qualcomm uC Large Immediate Arithmetic extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcilo`` - LLVM implements `version 0.3 of the Qualcomm uC Large Offset Load Store extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.3 of the Qualcomm uC Large Offset Load Store extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcilsm`` LLVM implements `version 0.6 of the Qualcomm uC Load Store Multiple extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcisim`` - LLVM implements `version 0.2 of the Qualcomm uC Simulation Hint extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.2 of the Qualcomm uC Simulation Hint extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcisls`` - LLVM implements `version 0.2 of the Qualcomm uC Scaled Load Store extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.2 of the Qualcomm uC Scaled Load Store extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``experimental-Xqcisync`` - LLVM implements `version 0.3 of the Qualcomm uC Sync Delay extension specification `__ by Qualcomm. These instructions are only available for riscv32. + LLVM implements `version 0.3 of the Qualcomm uC Sync Delay extension specification `__ by Qualcomm. These instructions are only available for riscv32. ``Xmipscmov`` LLVM implements conditional move for the `p8700 processor `__ by MIPS. 
@@ -513,6 +513,9 @@ The current vendor extensions supported are: ``XAndesPerf`` LLVM implements `version 5.0.0 of the Andes Performance Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification. +``XAndesVBFHCvt`` + LLVM implements `version 5.0.0 of the Andes Vector BFLOAT16 Conversion Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification. + ``XAndesVPackFPH`` LLVM implements `version 5.0.0 of the Andes Vector Packed FP16 Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification. diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 5c9ed181af59e..0395f43c61953 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -210,6 +210,7 @@ Changes to the RISC-V Backend * The `Shlcofideleg` extension was added. * `-mcpu=sifive-x390` was added. * `-mtune=andes-45-series` was added. +* Adds assembler support for the Andes `XAndesvbfhcvt` (Andes Vector BFLOAT16 Conversion extension). Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/include/llvm-c/ExecutionEngine.h b/llvm/include/llvm-c/ExecutionEngine.h index c5fc9bdb4d07f..2062cbf470d8b 100644 --- a/llvm/include/llvm-c/ExecutionEngine.h +++ b/llvm/include/llvm-c/ExecutionEngine.h @@ -33,7 +33,15 @@ LLVM_C_EXTERN_C_BEGIN * @{ */ +/** + * Empty function used to force the linker to link MCJIT. + * Has no effect when called on a pre-built library (dylib interface). + */ void LLVMLinkInMCJIT(void); +/** + * Empty function used to force the linker to link the LLVM interpreter. + * Has no effect when called on a pre-built library (dylib interface). 
+ */ void LLVMLinkInInterpreter(void); typedef struct LLVMOpaqueGenericValue *LLVMGenericValueRef; diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index 715df7ab9a7aa..c2460c497b40b 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -1,4 +1,4 @@ -//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===// +//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h index dfda2dcee0db1..09a8875e1e28c 100644 --- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -545,10 +545,6 @@ struct IRInstructionMapper { // dependent. InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; } InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; } - // DebugInfo should be included in the regions, but should not be - // analyzed for similarity as it has no bearing on the outcome of the - // program. - InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; } InstrType visitIntrinsicInst(IntrinsicInst &II) { // These are disabled due to complications in the CodeExtractor when // outlining these instructions. For instance, It is unclear what we diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h index c9d3874e7dd96..0858d8aee2186 100644 --- a/llvm/include/llvm/Analysis/PtrUseVisitor.h +++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h @@ -285,7 +285,6 @@ class PtrUseVisitor : protected InstVisitor, // No-op intrinsics which we know don't escape the pointer to logic in // some other function. 
- void visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) {} void visitMemIntrinsic(MemIntrinsic &I) {} void visitIntrinsicInst(IntrinsicInst &II) { switch (II.getIntrinsicID()) { diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def index 68753a2497db2..4015df990729f 100644 --- a/llvm/include/llvm/Analysis/VecFuncs.def +++ b/llvm/include/llvm/Analysis/VecFuncs.def @@ -237,6 +237,266 @@ TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVdN4v_log", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVbN4v_logf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVdN8v_logf", FIXED(8), "_ZGV_LLVM_N8v") +#elif defined(TLI_DEFINE_LIBMVEC_AARCH64_VECFUNCS) + +TLI_DEFINE_VECFUNC("acos", "_ZGVnN2v_acos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("acosf", "_ZGVnN2v_acosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("acosf", "_ZGVnN4v_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("acos", "_ZGVsMxv_acos", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("acosf", "_ZGVsMxv_acosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.acos.f64", "_ZGVnN2v_acos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVnN2v_acosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVnN4v_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.acos.f64", "_ZGVsMxv_acos", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVsMxv_acosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("acosh", "_ZGVnN2v_acosh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("acoshf", "_ZGVnN2v_acoshf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", 
CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("acoshf", "_ZGVnN4v_acoshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("acosh", "_ZGVsMxv_acosh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("acoshf", "_ZGVsMxv_acoshf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("asin", "_ZGVnN2v_asin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("asinf", "_ZGVnN2v_asinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("asinf", "_ZGVnN4v_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("asin", "_ZGVsMxv_asin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("asinf", "_ZGVsMxv_asinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.asin.f64", "_ZGVnN2v_asin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVnN2v_asinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVnN4v_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.asin.f64", "_ZGVsMxv_asin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVsMxv_asinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("asinh", "_ZGVnN2v_asinh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("asinhf", "_ZGVnN2v_asinhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("asinhf", "_ZGVnN4v_asinhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("asinh", "_ZGVsMxv_asinh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("asinhf", "_ZGVsMxv_asinhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("atan", "_ZGVnN2v_atan", FIXED(2), NOMASK, 
"_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("atanf", "_ZGVnN2v_atanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("atanf", "_ZGVnN4v_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("atan", "_ZGVsMxv_atan", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("atanf", "_ZGVsMxv_atanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.atan.f64", "_ZGVnN2v_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVnN2v_atanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVnN4v_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.atan.f64", "_ZGVsMxv_atan", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVsMxv_atanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("atan2", "_ZGVnN2vv_atan2", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("atan2f", "_ZGVnN2vv_atan2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("atan2f", "_ZGVnN4vv_atan2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("atan2", "_ZGVsMxvv_atan2", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC) +TLI_DEFINE_VECFUNC("atan2f", "_ZGVsMxvv_atan2f", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.atan2.f64", "_ZGVnN2vv_atan2", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.atan2.f32", "_ZGVnN2vv_atan2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.atan2.f32", "_ZGVnN4vv_atan2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.atan2.f64", "_ZGVsMxvv_atan2", SCALABLE(2), 
MASKED, "_ZGVsMxvv", NOCC) +TLI_DEFINE_VECFUNC("llvm.atan2.f32", "_ZGVsMxvv_atan2f", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC) + +TLI_DEFINE_VECFUNC("atanh", "_ZGVnN2v_atanh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("atanhf", "_ZGVnN2v_atanhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("atanhf", "_ZGVnN4v_atanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("atanh", "_ZGVsMxv_atanh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("atanhf", "_ZGVsMxv_atanhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("cbrt", "_ZGVnN2v_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("cbrtf", "_ZGVnN2v_cbrtf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("cbrtf", "_ZGVnN4v_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("cbrt", "_ZGVsMxv_cbrt", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("cbrtf", "_ZGVsMxv_cbrtf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("cos", "_ZGVnN2v_cos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("cosf", "_ZGVnN2v_cosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("cosf", "_ZGVnN4v_cosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("cos", "_ZGVsMxv_cos", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("cosf", "_ZGVsMxv_cosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVnN2v_cos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVnN2v_cosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVnN4v_cosf", FIXED(4), NOMASK, 
"_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVsMxv_cos", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVsMxv_cosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("cosh", "_ZGVnN2v_cosh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("coshf", "_ZGVnN2v_coshf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("coshf", "_ZGVnN4v_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("cosh", "_ZGVsMxv_cosh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("coshf", "_ZGVsMxv_coshf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.cosh.f64", "_ZGVnN2v_cosh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVnN2v_coshf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVnN4v_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.cosh.f64", "_ZGVsMxv_cosh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVsMxv_coshf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("erf", "_ZGVnN2v_erf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("erff", "_ZGVnN2v_erff", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("erff", "_ZGVnN4v_erff", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("erf", "_ZGVsMxv_erf", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("erff", "_ZGVsMxv_erff", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("erfc", "_ZGVnN2v_erfc", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("erfcf", "_ZGVnN2v_erfcf", FIXED(2), NOMASK, 
"_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("erfcf", "_ZGVnN4v_erfcf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("erfc", "_ZGVsMxv_erfc", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("erfcf", "_ZGVsMxv_erfcf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("exp", "_ZGVnN2v_exp", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("expf", "_ZGVnN2v_expf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("expf", "_ZGVnN4v_expf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("exp", "_ZGVsMxv_exp", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("expf", "_ZGVsMxv_expf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVnN2v_expf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVsMxv_exp", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVsMxv_expf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("exp10", "_ZGVnN2v_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("exp10f", "_ZGVnN2v_exp10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("exp10f", "_ZGVnN4v_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("exp10", "_ZGVsMxv_exp10", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("exp10f", "_ZGVsMxv_exp10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.exp10.f64", "_ZGVnN2v_exp10", FIXED(2), NOMASK, 
"_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.exp10.f32", "_ZGVnN2v_exp10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.exp10.f32", "_ZGVnN4v_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.exp10.f64", "_ZGVsMxv_exp10", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.exp10.f32", "_ZGVsMxv_exp10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("exp2", "_ZGVnN2v_exp2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("exp2f", "_ZGVnN2v_exp2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("exp2f", "_ZGVnN4v_exp2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("exp2", "_ZGVsMxv_exp2", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("exp2f", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.exp2.f64", "_ZGVnN2v_exp2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVnN2v_exp2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVnN4v_exp2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.exp2.f64", "_ZGVsMxv_exp2", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("expm1", "_ZGVnN2v_expm1", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("expm1f", "_ZGVnN2v_expm1f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("expm1f", "_ZGVnN4v_expm1f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("expm1", "_ZGVsMxv_expm1", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) 
+TLI_DEFINE_VECFUNC("expm1f", "_ZGVsMxv_expm1f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("hypot", "_ZGVnN2vv_hypot", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("hypotf", "_ZGVnN2vv_hypotf", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("hypotf", "_ZGVnN4vv_hypotf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("hypot", "_ZGVsMxvv_hypot", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC) +TLI_DEFINE_VECFUNC("hypotf", "_ZGVsMxvv_hypotf", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC) + +TLI_DEFINE_VECFUNC("log", "_ZGVnN2v_log", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("logf", "_ZGVnN2v_logf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("logf", "_ZGVnN4v_logf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("log", "_ZGVsMxv_log", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("logf", "_ZGVsMxv_logf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVnN2v_log", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVnN2v_logf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVnN4v_logf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVsMxv_log", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVsMxv_logf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("log10", "_ZGVnN2v_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("log10f", "_ZGVnN2v_log10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("log10f", "_ZGVnN4v_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", 
CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("log10", "_ZGVsMxv_log10", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("log10f", "_ZGVsMxv_log10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.log10.f64", "_ZGVnN2v_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVnN2v_log10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVnN4v_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.log10.f64", "_ZGVsMxv_log10", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVsMxv_log10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("log1p", "_ZGVnN2v_log1p", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("log1pf", "_ZGVnN2v_log1pf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("log1pf", "_ZGVnN4v_log1pf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("log1p", "_ZGVsMxv_log1p", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("log1pf", "_ZGVsMxv_log1pf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("log2", "_ZGVnN2v_log2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("log2f", "_ZGVnN2v_log2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("log2f", "_ZGVnN4v_log2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("log2", "_ZGVsMxv_log2", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("log2f", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.log2.f64", "_ZGVnN2v_log2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVnN2v_log2f", 
FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVnN4v_log2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.log2.f64", "_ZGVsMxv_log2", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("pow", "_ZGVnN2vv_pow", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("powf", "_ZGVnN2vv_powf", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("powf", "_ZGVnN4vv_powf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("pow", "_ZGVsMxvv_pow", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC) +TLI_DEFINE_VECFUNC("powf", "_ZGVsMxvv_powf", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVnN2vv_powf", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVsMxvv_pow", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC) +TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVsMxvv_powf", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC) + +TLI_DEFINE_VECFUNC("sin", "_ZGVnN2v_sin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("sinf", "_ZGVnN2v_sinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("sinf", "_ZGVnN4v_sinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("sin", "_ZGVsMxv_sin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("sinf", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.sin.f64", 
"_ZGVnN2v_sin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVnN2v_sinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVnN4v_sinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVsMxv_sin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("sinh", "_ZGVnN2v_sinh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("sinhf", "_ZGVnN2v_sinhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("sinhf", "_ZGVnN4v_sinhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.sinh.f64", "_ZGVnN2v_sinh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVnN2v_sinhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVnN4v_sinhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.sinh.f64", "_ZGVsMxv_sinh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("tan", "_ZGVnN2v_tan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("tanf", "_ZGVnN2v_tanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("tanf", "_ZGVnN4v_tanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("tan", "_ZGVsMxv_tan", SCALABLE(2), MASKED, "_ZGVsMxv", 
NOCC) +TLI_DEFINE_VECFUNC("tanf", "_ZGVsMxv_tanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVnN2v_tan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVnN2v_tanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVnN4v_tanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVsMxv_tan", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVsMxv_tanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("tanh", "_ZGVnN2v_tanh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("tanhf", "_ZGVnN2v_tanhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("tanhf", "_ZGVnN4v_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("tanh", "_ZGVsMxv_tanh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("tanhf", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + +TLI_DEFINE_VECFUNC("llvm.tanh.f64", "_ZGVnN2v_tanh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVnN2v_tanhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVnN4v_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall) +TLI_DEFINE_VECFUNC("llvm.tanh.f64", "_ZGVsMxv_tanh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC) +TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC) + #elif defined(TLI_DEFINE_MASSV_VECFUNCS) // IBM MASS library's vector Functions diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 08949e39716d5..6d625dad5853f 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ 
b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -154,17 +154,17 @@ enum class FeatureFlags : uint64_t { static_assert((uint64_t)FeatureFlags::NextUnusedBit <= 1ull << 63, "Shader flag bits exceed enum size."); -#define ROOT_ELEMENT_FLAG(Num, Val) Val = 1ull << Num, +#define ROOT_ELEMENT_FLAG(Num, Val) Val = Num, enum class RootElementFlag : uint32_t { #include "DXContainerConstants.def" }; -#define ROOT_DESCRIPTOR_FLAG(Num, Val) Val = 1ull << Num, +#define ROOT_DESCRIPTOR_FLAG(Num, Val) Val = Num, enum class RootDescriptorFlag : uint32_t { #include "DXContainerConstants.def" }; -#define DESCRIPTOR_RANGE_FLAG(Num, Val) Val = 1ull << Num, +#define DESCRIPTOR_RANGE_FLAG(Num, Val) Val = Num, enum class DescriptorRangeFlag : uint32_t { #include "DXContainerConstants.def" }; diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 501ef0c31cdd0..18e79e6fa65a5 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -59,18 +59,19 @@ SHADER_FEATURE_FLAG(33, 39, NextUnusedBit, "Next reserved shader flag bit (not a // ROOT_ELEMENT_FLAG(bit offset for the flag, name). 
#ifdef ROOT_ELEMENT_FLAG -ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout) -ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess) -ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess) -ROOT_ELEMENT_FLAG(3, DenyDomainShaderRootAccess) -ROOT_ELEMENT_FLAG(4, DenyGeometryShaderRootAccess) -ROOT_ELEMENT_FLAG(5, DenyPixelShaderRootAccess) -ROOT_ELEMENT_FLAG(6, AllowStreamOutput) -ROOT_ELEMENT_FLAG(7, LocalRootSignature) -ROOT_ELEMENT_FLAG(8, DenyAmplificationShaderRootAccess) -ROOT_ELEMENT_FLAG(9, DenyMeshShaderRootAccess) -ROOT_ELEMENT_FLAG(10, CBVSRVUAVHeapDirectlyIndexed) -ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed) +ROOT_ELEMENT_FLAG(0, NONE) +ROOT_ELEMENT_FLAG(0x1, AllowInputAssemblerInputLayout) +ROOT_ELEMENT_FLAG(0x2, DenyVertexShaderRootAccess) +ROOT_ELEMENT_FLAG(0x4, DenyHullShaderRootAccess) +ROOT_ELEMENT_FLAG(0x8, DenyDomainShaderRootAccess) +ROOT_ELEMENT_FLAG(0x10, DenyGeometryShaderRootAccess) +ROOT_ELEMENT_FLAG(0x20, DenyPixelShaderRootAccess) +ROOT_ELEMENT_FLAG(0x40, AllowStreamOutput) +ROOT_ELEMENT_FLAG(0x80, LocalRootSignature) +ROOT_ELEMENT_FLAG(0x100, DenyAmplificationShaderRootAccess) +ROOT_ELEMENT_FLAG(0x200, DenyMeshShaderRootAccess) +ROOT_ELEMENT_FLAG(0x400, CBVSRVUAVHeapDirectlyIndexed) +ROOT_ELEMENT_FLAG(0x800, SamplerHeapDirectlyIndexed) #undef ROOT_ELEMENT_FLAG #endif // ROOT_ELEMENT_FLAG @@ -79,9 +80,9 @@ ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed) #ifdef ROOT_DESCRIPTOR_FLAG ROOT_DESCRIPTOR_FLAG(0, NONE) -ROOT_DESCRIPTOR_FLAG(1, DATA_VOLATILE) -ROOT_DESCRIPTOR_FLAG(2, DATA_STATIC_WHILE_SET_AT_EXECUTE) -ROOT_DESCRIPTOR_FLAG(3, DATA_STATIC) +ROOT_DESCRIPTOR_FLAG(0x2, DATA_VOLATILE) +ROOT_DESCRIPTOR_FLAG(0x4, DATA_STATIC_WHILE_SET_AT_EXECUTE) +ROOT_DESCRIPTOR_FLAG(0x8, DATA_STATIC) #undef ROOT_DESCRIPTOR_FLAG #endif // ROOT_DESCRIPTOR_FLAG @@ -90,11 +91,11 @@ ROOT_DESCRIPTOR_FLAG(3, DATA_STATIC) #ifdef DESCRIPTOR_RANGE_FLAG DESCRIPTOR_RANGE_FLAG(0, NONE) -DESCRIPTOR_RANGE_FLAG(1, DESCRIPTORS_VOLATILE) -DESCRIPTOR_RANGE_FLAG(2, 
DATA_VOLATILE) -DESCRIPTOR_RANGE_FLAG(3, DATA_STATIC_WHILE_SET_AT_EXECUTE) -DESCRIPTOR_RANGE_FLAG(4, DATA_STATIC) -DESCRIPTOR_RANGE_FLAG(16, DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS) +DESCRIPTOR_RANGE_FLAG(0x1, DESCRIPTORS_VOLATILE) +DESCRIPTOR_RANGE_FLAG(0x2, DATA_VOLATILE) +DESCRIPTOR_RANGE_FLAG(0x4, DATA_STATIC_WHILE_SET_AT_EXECUTE) +DESCRIPTOR_RANGE_FLAG(0x8, DATA_STATIC) +DESCRIPTOR_RANGE_FLAG(0x10000, DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS) #undef DESCRIPTOR_RANGE_FLAG #endif // DESCRIPTOR_RANGE_FLAG diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h index 2ead62025efa7..231b7ac17d75f 100644 --- a/llvm/include/llvm/BinaryFormat/Dwarf.h +++ b/llvm/include/llvm/BinaryFormat/Dwarf.h @@ -1191,32 +1191,32 @@ template struct EnumTraits : public std::false_type {}; template <> struct EnumTraits : public std::true_type { static constexpr char Type[3] = "AT"; - static constexpr StringRef (*StringFn)(unsigned) = &AttributeString; + LLVM_ABI static StringRef (*const StringFn)(unsigned); }; template <> struct EnumTraits
: public std::true_type { static constexpr char Type[5] = "FORM"; - static constexpr StringRef (*StringFn)(unsigned) = &FormEncodingString; + LLVM_ABI static StringRef (*const StringFn)(unsigned); }; template <> struct EnumTraits : public std::true_type { static constexpr char Type[4] = "IDX"; - static constexpr StringRef (*StringFn)(unsigned) = &IndexString; + LLVM_ABI static StringRef (*const StringFn)(unsigned); }; template <> struct EnumTraits : public std::true_type { static constexpr char Type[4] = "TAG"; - static constexpr StringRef (*StringFn)(unsigned) = &TagString; + LLVM_ABI static StringRef (*const StringFn)(unsigned); }; template <> struct EnumTraits : public std::true_type { static constexpr char Type[4] = "LNS"; - static constexpr StringRef (*StringFn)(unsigned) = &LNStandardString; + LLVM_ABI static StringRef (*const StringFn)(unsigned); }; template <> struct EnumTraits : public std::true_type { static constexpr char Type[3] = "OP"; - static constexpr StringRef (*StringFn)(unsigned) = &OperationEncodingString; + LLVM_ABI static StringRef (*const StringFn)(unsigned); }; inline uint64_t computeTombstoneAddress(uint8_t AddressByteSize) { diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def index 7ae3d3f205772..b02462ca89fdd 100644 --- a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def +++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def @@ -25,4 +25,4 @@ ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_ABS20_U, 192) ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_BRANCH, 193) ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_32, 194) -ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_JUMP_PLT, 195) +ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_CALL_PLT, 195) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 574152e254f15..3b87978fe3fab 100644 --- 
a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -478,12 +478,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty) const override { - EVT VT = getTLI()->getValueType(DL, Ty); + EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true); return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); } bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty) const override { - EVT VT = getTLI()->getValueType(DL, Ty); + EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true); return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); } diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 2e3807a2dfffd..d413227c4d961 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -938,6 +938,10 @@ template inline UnaryOpc_match m_Trunc(const Opnd &Op) { return UnaryOpc_match(ISD::TRUNCATE, Op); } +template inline UnaryOpc_match m_Abs(const Opnd &Op) { + return UnaryOpc_match(ISD::ABS, Op); +} + /// Match a zext or identity /// Allows to peek through optional extensions template inline auto m_ZExtOrSelf(const Opnd &Op) { diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 4ed81d25e8e22..dd44afd0855a5 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3574,20 +3574,18 @@ class LLVM_ABI TargetLoweringBase { /// Override the default CondCode to be used to test the result of the /// comparison libcall against zero. - /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD. 
- void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) { - CmpLibcallCCs[Call] = CC; + /// FIXME: This should be removed + void setCmpLibcallCC(RTLIB::Libcall Call, CmpInst::Predicate Pred) { + Libcalls.setSoftFloatCmpLibcallPredicate(Call, Pred); } - /// Get the CondCode that's to be used to test the result of the comparison /// libcall against zero. - /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD. - ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const { - return CmpLibcallCCs[Call]; + CmpInst::Predicate + getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { + return Libcalls.getSoftFloatCmpLibcallPredicate(Call); } - /// Set the CallingConv that should be used for the specified libcall. void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { Libcalls.setLibcallCallingConv(Call, CC); diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index 06d4756397911..ce83de8e4cba9 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -164,6 +164,9 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_MMAN_H ${HAVE_SYS_MMAN_H} +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_SYS_IOCTL_H ${HAVE_SYS_IOCTL_H} + /* Define to 1 if stat struct has st_mtimespec member .*/ #cmakedefine HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC} diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake index 6d3c37cc8b194..a0ad517a6ecf4 100644 --- a/llvm/include/llvm/Config/llvm-config.h.cmake +++ b/llvm/include/llvm/Config/llvm-config.h.cmake @@ -131,10 +131,10 @@ /* Define to 1 to enable expensive checks for debug location coverage checking, and to 0 otherwise. 
*/ -#cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE /* Define to 1 to enable expensive tracking of the origin of debug location coverage bugs, and to 0 otherwise. */ -#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING +#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN #endif diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h new file mode 100644 index 0000000000000..32e8247ac4c22 --- /dev/null +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h @@ -0,0 +1,28 @@ +//===- DWARFCFIPrinter.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_DWARF_DWARFCFIPRINTER_H +#define LLVM_DEBUGINFO_DWARF_DWARFCFIPRINTER_H + +#include "llvm/DebugInfo/DWARF/DWARFCFIProgram.h" + +namespace llvm { + +struct DIDumpOptions; + +namespace dwarf { + +void printCFIProgram(const CFIProgram &P, raw_ostream &OS, + const DIDumpOptions &DumpOpts, unsigned IndentLevel, + std::optional Address); + +} // end namespace dwarf + +} // end namespace llvm + +#endif // LLVM_DEBUGINFO_DWARF_DWARFCFIPRINTER_H diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h index 24a0f389470db..ad7358c28f16b 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h @@ -24,6 +24,7 @@ namespace llvm { namespace dwarf { + /// Represent a sequence of Call Frame Information instructions that, when read /// in order, construct a table mapping PC to frame state. 
This can also be /// referred to as "CFI rules" in DWARF literature to avoid confusion with @@ -80,15 +81,37 @@ class CFIProgram { LLVM_ABI Error parse(DWARFDataExtractor Data, uint64_t *Offset, uint64_t EndOffset); - LLVM_ABI void dump(raw_ostream &OS, DIDumpOptions DumpOpts, - unsigned IndentLevel, - std::optional InitialLocation) const; - void addInstruction(const Instruction &I) { Instructions.push_back(I); } /// Get a DWARF CFI call frame string for the given DW_CFA opcode. LLVM_ABI StringRef callFrameString(unsigned Opcode) const; + /// Types of operands to CFI instructions + /// In DWARF, this type is implicitly tied to a CFI instruction opcode and + /// thus this type doesn't need to be explicitly written to the file (this is + /// not a DWARF encoding). The relationship of instrs to operand types can + /// be obtained from getOperandTypes() and is only used to simplify + /// instruction printing and error messages. + enum OperandType { + OT_Unset, + OT_None, + OT_Address, + OT_Offset, + OT_FactoredCodeOffset, + OT_SignedFactDataOffset, + OT_UnsignedFactDataOffset, + OT_Register, + OT_AddressSpace, + OT_Expression + }; + + /// Get the OperandType as a "const char *". + static const char *operandTypeString(OperandType OT); + + /// Retrieve the array describing the types of operands according to the enum + /// above. This is indexed by opcode. + static ArrayRef getOperandTypes(); + private: std::vector Instructions; const uint64_t CodeAlignmentFactor; @@ -121,37 +144,6 @@ class CFIProgram { Instructions.back().Ops.push_back(Operand2); Instructions.back().Ops.push_back(Operand3); } - - /// Types of operands to CFI instructions - /// In DWARF, this type is implicitly tied to a CFI instruction opcode and - /// thus this type doesn't need to be explicitly written to the file (this is - /// not a DWARF encoding). The relationship of instrs to operand types can - /// be obtained from getOperandTypes() and is only used to simplify - /// instruction printing. 
- enum OperandType { - OT_Unset, - OT_None, - OT_Address, - OT_Offset, - OT_FactoredCodeOffset, - OT_SignedFactDataOffset, - OT_UnsignedFactDataOffset, - OT_Register, - OT_AddressSpace, - OT_Expression - }; - - /// Get the OperandType as a "const char *". - static const char *operandTypeString(OperandType OT); - - /// Retrieve the array describing the types of operands according to the enum - /// above. This is indexed by opcode. - static ArrayRef getOperandTypes(); - - /// Print \p Opcode's operand number \p OperandIdx which has value \p Operand. - void printOperand(raw_ostream &OS, DIDumpOptions DumpOpts, - const Instruction &Instr, unsigned OperandIdx, - uint64_t Operand, std::optional &Address) const; }; } // end namespace dwarf diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h index 2f028817b45b6..9dfbd3cb68928 100644 --- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h +++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h @@ -16,6 +16,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/DXILABI.h" +#include #include namespace llvm { diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h index 6d959ad5bdc7f..25c2a9f0cc808 100644 --- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h +++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h @@ -15,6 +15,7 @@ #define LLVM_FRONTEND_HLSL_HLSLROOTSIGNATUREUTILS_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/IntervalMap.h" #include "llvm/Frontend/HLSL/HLSLRootSignature.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" @@ -27,11 +28,22 @@ class Metadata; namespace hlsl { namespace rootsig { +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const RootFlags &Flags); + +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, + const RootConstants &Constants); + LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const 
DescriptorTableClause &Clause); LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const DescriptorTable &Table); +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, + const RootDescriptor &Descriptor); + +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, + const StaticSampler &StaticSampler); + LLVM_ABI void dumpRootElements(raw_ostream &OS, ArrayRef Elements); class MetadataBuilder { @@ -59,6 +71,62 @@ class MetadataBuilder { SmallVector GeneratedMetadata; }; +// RangeInfo holds the information to correctly construct a ResourceRange +// and retains this information to be used for displaying a better diagnostic +struct RangeInfo { + const static uint32_t Unbounded = ~0u; + + uint32_t LowerBound; + uint32_t UpperBound; +}; + +class ResourceRange { +public: + using MapT = llvm::IntervalMap>; + +private: + MapT Intervals; + +public: + ResourceRange(MapT::Allocator &Allocator) : Intervals(MapT(Allocator)) {} + + // Returns a reference to the first RangeInfo that overlaps with + // [Info.LowerBound;Info.UpperBound], or, std::nullopt if there is no overlap + std::optional getOverlapping(const RangeInfo &Info) const; + + // Return the mapped RangeInfo at X or nullptr if no mapping exists + const RangeInfo *lookup(uint32_t X) const; + + // Insert the required (sub-)intervals such that the interval of [a;b] = + // [Info.LowerBound, Info.UpperBound] is covered and points to a valid + // RangeInfo &. 
+ // + // For instance consider the following chain of inserting RangeInfos with the + // intervals denoting the Lower/Upper-bounds: + // + // A = [0;2] + // insert(A) -> false + // intervals: [0;2] -> &A + // B = [5;7] + // insert(B) -> false + // intervals: [0;2] -> &A, [5;7] -> &B + // C = [4;7] + // insert(C) -> true + // intervals: [0;2] -> &A, [4;7] -> &C + // D = [1;5] + // insert(D) -> true + // intervals: [0;2] -> &A, [3;3] -> &D, [4;7] -> &C + // E = [0;unbounded] + // insert(E) -> true + // intervals: [0;unbounded] -> E + // + // Returns a reference to the first RangeInfo that overlaps with + // [Info.LowerBound;Info.UpperBound], or, std::nullopt if there is no overlap + // (equivalent to getOverlapping) + std::optional insert(const RangeInfo &Info); +}; + } // namespace rootsig } // namespace hlsl } // namespace llvm diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index e0714e812e5cd..de888ff86fe91 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -701,7 +701,7 @@ template // struct IndirectT { using InvokedByFptr = E; using WrapperTrait = std::true_type; - InvokedByFptr v; + OPT(InvokedByFptr) v; }; // V5.2: [14.1.2] `init` clause diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 027692275b63b..a87111cb5a11d 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -246,6 +246,7 @@ def OMPC_Inclusive : Clause<[Spelling<"inclusive">]> { let flangClass = "OmpObjectList"; } def OMPC_Indirect : Clause<[Spelling<"indirect">]> { + let flangClass = "OmpIndirectClause"; } def OMPC_Init : Clause<[Spelling<"init">]> { let clangClass = "OMPInitClause"; @@ -646,7 +647,7 @@ def OMP_EndAssumes : Directive<[Spelling<"end assumes">]> { def OMP_BeginDeclareTarget : Directive<[Spelling<"begin declare target">]> { let allowedClauses = [ VersionedClause, - 
VersionedClause, + VersionedClause, VersionedClause, VersionedClause, ]; @@ -724,7 +725,7 @@ def OMP_DeclareSimd : Directive<[Spelling<"declare simd">]> { def OMP_DeclareTarget : Directive<[Spelling<"declare target">]> { let allowedClauses = [ VersionedClause, - VersionedClause, + VersionedClause, VersionedClause, VersionedClause, ]; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index e4b1241151e9d..93fb0d8e8d078 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2507,7 +2507,7 @@ class OpenMPIRBuilder { TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector &Dependencies, - bool HasNoWait); + const TargetDataRTArgs &RTArgs, bool HasNoWait); /// Emit the arguments to be passed to the runtime library based on the /// arrays of base pointers, pointers, sizes, map types, and mappers. If diff --git a/llvm/include/llvm/IR/DebugLoc.h b/llvm/include/llvm/IR/DebugLoc.h index 2fabae9bfc66e..999e03b6374a5 100644 --- a/llvm/include/llvm/IR/DebugLoc.h +++ b/llvm/include/llvm/IR/DebugLoc.h @@ -26,7 +26,7 @@ namespace llvm { class DILocation; class Function; -#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE // Used to represent different "kinds" of DebugLoc, expressing that the // instruction it is part of is either normal and should contain a valid // DILocation, or otherwise describing the reason why the instruction does @@ -90,7 +90,7 @@ namespace llvm { using DebugLocTrackingRef = DILocAndCoverageTracking; #else using DebugLocTrackingRef = TrackingMDNodeRef; -#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE /// A debug info location. /// @@ -117,12 +117,12 @@ namespace llvm { /// IR. 
LLVM_ABI explicit DebugLoc(const MDNode *N); -#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE DebugLoc(DebugLocKind Kind) : Loc(Kind) {} DebugLocKind getKind() const { return Loc.Kind; } #endif -#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE static inline DebugLoc getTemporary() { return DebugLoc(DebugLocKind::Temporary); } @@ -140,7 +140,7 @@ namespace llvm { static inline DebugLoc getUnknown() { return DebugLoc(); } static inline DebugLoc getCompilerGenerated() { return DebugLoc(); } static inline DebugLoc getDropped() { return DebugLoc(); } -#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE /// When two instructions are combined into a single instruction we also /// need to combine the original locations into a single location. @@ -174,7 +174,7 @@ namespace llvm { DebugLoc orElse(DebugLoc Other) const { if (*this) return *this; -#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE if (Other) return Other; if (getKind() != DebugLocKind::Normal) @@ -184,7 +184,7 @@ namespace llvm { return *this; #else return Other; -#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE } /// Get the underlying \a DILocation. diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h index b4eb729c7ce38..6d5398bb7a4cd 100644 --- a/llvm/include/llvm/IR/InstVisitor.h +++ b/llvm/include/llvm/IR/InstVisitor.h @@ -199,13 +199,6 @@ class InstVisitor { RetTy visitCatchPadInst(CatchPadInst &I) { DELEGATE(FuncletPadInst); } RetTy visitFreezeInst(FreezeInst &I) { DELEGATE(Instruction); } - // Handle the special intrinsic instruction classes. 
- RetTy visitDbgDeclareInst(DbgDeclareInst &I) { DELEGATE(DbgVariableIntrinsic);} - RetTy visitDbgValueInst(DbgValueInst &I) { DELEGATE(DbgVariableIntrinsic);} - RetTy visitDbgVariableIntrinsic(DbgVariableIntrinsic &I) - { DELEGATE(DbgInfoIntrinsic);} - RetTy visitDbgLabelInst(DbgLabelInst &I) { DELEGATE(DbgInfoIntrinsic);} - RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); } RetTy visitMemSetInst(MemSetInst &I) { DELEGATE(MemIntrinsic); } RetTy visitMemSetPatternInst(MemSetPatternInst &I) { DELEGATE(IntrinsicInst); @@ -286,9 +279,6 @@ class InstVisitor { if (const Function *F = I.getCalledFunction()) { switch (F->getIntrinsicID()) { default: DELEGATE(IntrinsicInst); - case Intrinsic::dbg_declare: DELEGATE(DbgDeclareInst); - case Intrinsic::dbg_value: DELEGATE(DbgValueInst); - case Intrinsic::dbg_label: DELEGATE(DbgLabelInst); case Intrinsic::memcpy: case Intrinsic::memcpy_inline: DELEGATE(MemCpyInst); diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index f4420f460741b..a99937a90cbb7 100644 --- a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -1041,6 +1041,10 @@ class LLVM_ABI Module { /// Returns target-abi from MDString, null if target-abi is absent. StringRef getTargetABIFromMD(); + + /// Get how unwind v2 (epilog) information should be generated for x64 + /// Windows. 
+ WinX64EHUnwindV2Mode getWinX64EHUnwindV2Mode() const; }; /// Given "llvm.used" or "llvm.compiler.used" as a global name, collect the diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 051bcc147cb71..3e1531ebfd9d6 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -17,6 +17,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Sequence.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" @@ -86,14 +87,39 @@ struct RuntimeLibcallsInfo { return ArrayRef(LibcallRoutineNames).drop_back(); } + /// Get the comparison predicate that's to be used to test the result of the + /// comparison libcall against zero. This should only be used with + /// floating-point compare libcalls. + CmpInst::Predicate + getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const { + return SoftFloatCompareLibcallPredicates[Call]; + } + + // FIXME: This should be removed. This should be private constant. + void setSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call, + CmpInst::Predicate Pred) { + SoftFloatCompareLibcallPredicates[Call] = Pred; + } + private: /// Stores the name each libcall. - const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1]; + const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1] = {nullptr}; + + static_assert(static_cast(CallingConv::C) == 0, + "default calling conv should be encoded as 0"); /// Stores the CallingConv that should be used for each libcall. - CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL]; + CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL] = {}; - static bool darwinHasSinCos(const Triple &TT) { + /// The condition type that should be used to test the result of each of the + /// soft floating-point comparison libcall against integer zero. 
+ /// + // FIXME: This is only relevant for the handful of floating-point comparison + // runtime calls; it's excessive to have a table entry for every single + // opcode. + CmpInst::Predicate SoftFloatCompareLibcallPredicates[RTLIB::UNKNOWN_LIBCALL]; + + static bool darwinHasSinCosStret(const Triple &TT) { assert(TT.isOSDarwin() && "should be called with darwin triple"); // Don't bother with 32 bit x86. if (TT.getArch() == Triple::x86) @@ -108,6 +134,14 @@ struct RuntimeLibcallsInfo { return true; } + /// Return true if the target has sincosf/sincos/sincosl functions + static bool hasSinCos(const Triple &TT) { + return TT.isGNUEnvironment() || TT.isOSFuchsia() || + (TT.isAndroid() && !TT.isAndroidVersionLT(9)); + } + + void initSoftFloatCmpLibcallPredicates(); + /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. LLVM_ABI void initLibcalls(const Triple &TT); diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 42610d505c2bd..1b5b1d5888824 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -14,318 +14,331 @@ #ifndef LLVM_INITIALIZEPASSES_H #define LLVM_INITIALIZEPASSES_H +#include "llvm/Support/Compiler.h" + namespace llvm { class PassRegistry; /// Initialize all passes linked into the Core library. -void initializeCore(PassRegistry &); +LLVM_ABI void initializeCore(PassRegistry &); /// Initialize all passes linked into the TransformUtils library. -void initializeTransformUtils(PassRegistry &); +LLVM_ABI void initializeTransformUtils(PassRegistry &); /// Initialize all passes linked into the ScalarOpts library. -void initializeScalarOpts(PassRegistry &); +LLVM_ABI void initializeScalarOpts(PassRegistry &); /// Initialize all passes linked into the Vectorize library. -void initializeVectorization(PassRegistry &); +LLVM_ABI void initializeVectorization(PassRegistry &); /// Initialize all passes linked into the InstCombine library. 
-void initializeInstCombine(PassRegistry &); +LLVM_ABI void initializeInstCombine(PassRegistry &); /// Initialize all passes linked into the IPO library. -void initializeIPO(PassRegistry &); +LLVM_ABI void initializeIPO(PassRegistry &); /// Initialize all passes linked into the Analysis library. -void initializeAnalysis(PassRegistry &); +LLVM_ABI void initializeAnalysis(PassRegistry &); /// Initialize all passes linked into the CodeGen library. -void initializeCodeGen(PassRegistry &); +LLVM_ABI void initializeCodeGen(PassRegistry &); /// Initialize all passes linked into the GlobalISel library. -void initializeGlobalISel(PassRegistry &); +LLVM_ABI void initializeGlobalISel(PassRegistry &); /// Initialize all passes linked into the CodeGen library. -void initializeTarget(PassRegistry &); +LLVM_ABI void initializeTarget(PassRegistry &); -void initializeAAResultsWrapperPassPass(PassRegistry &); -void initializeAlwaysInlinerLegacyPassPass(PassRegistry &); -void initializeAssignmentTrackingAnalysisPass(PassRegistry &); -void initializeAssumptionCacheTrackerPass(PassRegistry &); -void initializeAtomicExpandLegacyPass(PassRegistry &); -void initializeBasicBlockPathCloningPass(PassRegistry &); -void initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &); -void initializeBasicBlockSectionsPass(PassRegistry &); -void initializeBarrierNoopPass(PassRegistry &); -void initializeBasicAAWrapperPassPass(PassRegistry &); -void initializeBlockFrequencyInfoWrapperPassPass(PassRegistry &); -void initializeBranchFolderLegacyPass(PassRegistry &); -void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry &); -void initializeBranchRelaxationLegacyPass(PassRegistry &); -void initializeBreakCriticalEdgesPass(PassRegistry &); -void initializeBreakFalseDepsPass(PassRegistry &); -void initializeCanonicalizeFreezeInLoopsPass(PassRegistry &); -void initializeCFGSimplifyPassPass(PassRegistry &); -void initializeCFGuardPass(PassRegistry &); -void 
initializeCFGuardLongjmpPass(PassRegistry &); -void initializeCFIFixupPass(PassRegistry &); -void initializeCFIInstrInserterPass(PassRegistry &); -void initializeCallBrPreparePass(PassRegistry &); -void initializeCallGraphDOTPrinterPass(PassRegistry &); -void initializeCallGraphViewerPass(PassRegistry &); -void initializeCallGraphWrapperPassPass(PassRegistry &); -void initializeCheckDebugMachineModulePass(PassRegistry &); -void initializeCodeGenPrepareLegacyPassPass(PassRegistry &); -void initializeComplexDeinterleavingLegacyPassPass(PassRegistry &); -void initializeConstantHoistingLegacyPassPass(PassRegistry &); -void initializeCycleInfoWrapperPassPass(PassRegistry &); -void initializeDAEPass(PassRegistry &); -void initializeDAHPass(PassRegistry &); -void initializeDCELegacyPassPass(PassRegistry &); -void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &); -void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &); -void initializeDXILResourceBindingWrapperPassPass(PassRegistry &); -void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &); -void initializeDXILResourceTypeWrapperPassPass(PassRegistry &); -void initializeDXILResourceWrapperPassPass(PassRegistry &); -void initializeDeadMachineInstructionElimPass(PassRegistry &); -void initializeDebugifyMachineModulePass(PassRegistry &); -void initializeDependenceAnalysisWrapperPassPass(PassRegistry &); -void initializeDetectDeadLanesLegacyPass(PassRegistry &); -void initializeDomOnlyPrinterWrapperPassPass(PassRegistry &); -void initializeDomOnlyViewerWrapperPassPass(PassRegistry &); -void initializeDomPrinterWrapperPassPass(PassRegistry &); -void initializeDomViewerWrapperPassPass(PassRegistry &); -void initializeDominanceFrontierWrapperPassPass(PassRegistry &); -void initializeDominatorTreeWrapperPassPass(PassRegistry &); -void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &); -void initializeEarlyCSELegacyPassPass(PassRegistry &); -void 
initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &); -void initializeEarlyIfConverterLegacyPass(PassRegistry &); -void initializeEarlyIfPredicatorPass(PassRegistry &); -void initializeEarlyMachineLICMPass(PassRegistry &); -void initializeEarlyTailDuplicateLegacyPass(PassRegistry &); -void initializeEdgeBundlesWrapperLegacyPass(PassRegistry &); -void initializeEHContGuardTargetsPass(PassRegistry &); -void initializeExpandFpLegacyPassPass(PassRegistry &); -void initializeExpandLargeDivRemLegacyPassPass(PassRegistry &); -void initializeExpandMemCmpLegacyPassPass(PassRegistry &); -void initializeExpandPostRALegacyPass(PassRegistry &); -void initializeExpandReductionsPass(PassRegistry &); -void initializeExpandVariadicsPass(PassRegistry &); -void initializeExternalAAWrapperPassPass(PassRegistry &); -void initializeFEntryInserterLegacyPass(PassRegistry &); -void initializeFinalizeISelPass(PassRegistry &); -void initializeFinalizeMachineBundlesPass(PassRegistry &); -void initializeFixIrreduciblePass(PassRegistry &); -void initializeFixupStatepointCallerSavedLegacyPass(PassRegistry &); -void initializeFlattenCFGLegacyPassPass(PassRegistry &); -void initializeFuncletLayoutPass(PassRegistry &); -void initializeGCEmptyBasicBlocksPass(PassRegistry &); -void initializeGCMachineCodeAnalysisPass(PassRegistry &); -void initializeGCModuleInfoPass(PassRegistry &); -void initializeGVNLegacyPassPass(PassRegistry &); -void initializeGlobalMergeFuncPassWrapperPass(PassRegistry &); -void initializeGlobalMergePass(PassRegistry &); -void initializeGlobalsAAWrapperPassPass(PassRegistry &); -void initializeHardwareLoopsLegacyPass(PassRegistry &); -void initializeMIRProfileLoaderPassPass(PassRegistry &); -void initializeIRSimilarityIdentifierWrapperPassPass(PassRegistry &); -void initializeIRTranslatorPass(PassRegistry &); -void initializeIVUsersWrapperPassPass(PassRegistry &); -void initializeIfConverterPass(PassRegistry &); -void 
initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry &); -void initializeImplicitNullChecksPass(PassRegistry &); -void initializeIndirectBrExpandLegacyPassPass(PassRegistry &); -void initializeInferAddressSpacesPass(PassRegistry &); -void initializeInstSimplifyLegacyPassPass(PassRegistry &); -void initializeInstructionCombiningPassPass(PassRegistry &); -void initializeInstructionSelectPass(PassRegistry &); -void initializeInterleavedAccessPass(PassRegistry &); -void initializeInterleavedLoadCombinePass(PassRegistry &); -void initializeJMCInstrumenterPass(PassRegistry &); -void initializeKCFIPass(PassRegistry &); -void initializeLCSSAVerificationPassPass(PassRegistry &); -void initializeLCSSAWrapperPassPass(PassRegistry &); -void initializeLazyBFIPassPass(PassRegistry &); -void initializeLazyBlockFrequencyInfoPassPass(PassRegistry &); -void initializeLazyBranchProbabilityInfoPassPass(PassRegistry &); -void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry &); -void initializeLazyValueInfoWrapperPassPass(PassRegistry &); -void initializeLegacyLICMPassPass(PassRegistry &); -void initializeLegalizerPass(PassRegistry &); -void initializeGISelCSEAnalysisWrapperPassPass(PassRegistry &); -void initializeGISelValueTrackingAnalysisLegacyPass(PassRegistry &); -void initializeLiveDebugValuesLegacyPass(PassRegistry &); -void initializeLiveDebugVariablesWrapperLegacyPass(PassRegistry &); -void initializeLiveIntervalsWrapperPassPass(PassRegistry &); -void initializeLiveRangeShrinkPass(PassRegistry &); -void initializeLiveRegMatrixWrapperLegacyPass(PassRegistry &); -void initializeLiveStacksWrapperLegacyPass(PassRegistry &); -void initializeLiveVariablesWrapperPassPass(PassRegistry &); -void initializeLoadStoreOptPass(PassRegistry &); -void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &); -void initializeLocalStackSlotPassPass(PassRegistry &); -void initializeLocalizerPass(PassRegistry &); -void 
initializeLoopDataPrefetchLegacyPassPass(PassRegistry &); -void initializeLoopExtractorLegacyPassPass(PassRegistry &); -void initializeLoopInfoWrapperPassPass(PassRegistry &); -void initializeLoopPassPass(PassRegistry &); -void initializeLoopSimplifyPass(PassRegistry &); -void initializeLoopStrengthReducePass(PassRegistry &); -void initializeLoopTermFoldPass(PassRegistry &); -void initializeLoopUnrollPass(PassRegistry &); -void initializeLowerAtomicLegacyPassPass(PassRegistry &); -void initializeLowerEmuTLSPass(PassRegistry &); -void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &); -void initializeLowerIntrinsicsPass(PassRegistry &); -void initializeLowerInvokeLegacyPassPass(PassRegistry &); -void initializeLowerSwitchLegacyPassPass(PassRegistry &); -void initializeKCFIPass(PassRegistry &); -void initializeMIRAddFSDiscriminatorsPass(PassRegistry &); -void initializeMIRCanonicalizerPass(PassRegistry &); -void initializeMIRNamerPass(PassRegistry &); -void initializeMIRPrintingPassPass(PassRegistry &); -void initializeMachineBlockFrequencyInfoWrapperPassPass(PassRegistry &); -void initializeMachineBlockPlacementLegacyPass(PassRegistry &); -void initializeMachineBlockPlacementStatsLegacyPass(PassRegistry &); -void initializeMachineBranchProbabilityInfoWrapperPassPass(PassRegistry &); -void initializeMachineCFGPrinterPass(PassRegistry &); -void initializeMachineCSELegacyPass(PassRegistry &); -void initializeMachineCombinerPass(PassRegistry &); -void initializeMachineCopyPropagationLegacyPass(PassRegistry &); -void initializeMachineCycleInfoPrinterLegacyPass(PassRegistry &); -void initializeMachineCycleInfoWrapperPassPass(PassRegistry &); -void initializeMachineDominanceFrontierPass(PassRegistry &); -void initializeMachineDominatorTreeWrapperPassPass(PassRegistry &); -void initializeMachineFunctionPrinterPassPass(PassRegistry &); -void initializeMachineFunctionSplitterPass(PassRegistry &); -void initializeMachineLateInstrsCleanupLegacyPass(PassRegistry &); -void 
initializeMachineLICMPass(PassRegistry &); -void initializeMachineLoopInfoWrapperPassPass(PassRegistry &); -void initializeMachineModuleInfoWrapperPassPass(PassRegistry &); -void initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry &); -void initializeMachineOutlinerPass(PassRegistry &); -void initializeStaticDataProfileInfoWrapperPassPass(PassRegistry &); -void initializeStaticDataAnnotatorPass(PassRegistry &); -void initializeMachinePipelinerPass(PassRegistry &); -void initializeMachinePostDominatorTreeWrapperPassPass(PassRegistry &); -void initializeMachineRegionInfoPassPass(PassRegistry &); -void initializeMachineSanitizerBinaryMetadataLegacyPass(PassRegistry &); -void initializeMachineSchedulerLegacyPass(PassRegistry &); -void initializeMachineSinkingLegacyPass(PassRegistry &); -void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &); -void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &); -void initializeMachineUniformityAnalysisPassPass(PassRegistry &); -void initializeMachineVerifierLegacyPassPass(PassRegistry &); -void initializeMemoryDependenceWrapperPassPass(PassRegistry &); -void initializeMemorySSAWrapperPassPass(PassRegistry &); -void initializeMergeICmpsLegacyPassPass(PassRegistry &); -void initializeModuleSummaryIndexWrapperPassPass(PassRegistry &); -void initializeModuloScheduleTestPass(PassRegistry &); -void initializeNaryReassociateLegacyPassPass(PassRegistry &); -void initializeObjCARCContractLegacyPassPass(PassRegistry &); -void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry &); -void initializeOptimizePHIsLegacyPass(PassRegistry &); -void initializePEILegacyPass(PassRegistry &); -void initializePHIEliminationPass(PassRegistry &); -void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry &); -void initializePatchableFunctionLegacyPass(PassRegistry &); -void initializePeepholeOptimizerLegacyPass(PassRegistry &); -void initializePhiValuesWrapperPassPass(PassRegistry &); -void 
initializePhysicalRegisterUsageInfoWrapperLegacyPass(PassRegistry &); -void initializePlaceBackedgeSafepointsLegacyPassPass(PassRegistry &); -void initializePostDomOnlyPrinterWrapperPassPass(PassRegistry &); -void initializePostDomOnlyViewerWrapperPassPass(PassRegistry &); -void initializePostDomPrinterWrapperPassPass(PassRegistry &); -void initializePostDomViewerWrapperPassPass(PassRegistry &); -void initializePostDominatorTreeWrapperPassPass(PassRegistry &); -void initializePostInlineEntryExitInstrumenterPass(PassRegistry &); -void initializePostMachineSchedulerLegacyPass(PassRegistry &); -void initializePostRAHazardRecognizerLegacyPass(PassRegistry &); -void initializePostRAMachineSinkingPass(PassRegistry &); -void initializePostRASchedulerLegacyPass(PassRegistry &); -void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry &); -void initializePrintFunctionPassWrapperPass(PassRegistry &); -void initializePrintModulePassWrapperPass(PassRegistry &); -void initializeProcessImplicitDefsPass(PassRegistry &); -void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &); -void initializePromoteLegacyPassPass(PassRegistry &); -void initializeRABasicPass(PassRegistry &); -void initializePseudoProbeInserterPass(PassRegistry &); -void initializeRAGreedyLegacyPass(PassRegistry &); -void initializeReachingDefAnalysisPass(PassRegistry &); -void initializeReassociateLegacyPassPass(PassRegistry &); -void initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &); -void initializeRegAllocFastPass(PassRegistry &); -void initializeRegAllocPriorityAdvisorAnalysisLegacyPass(PassRegistry &); -void initializeRegAllocScoringPass(PassRegistry &); -void initializeRegBankSelectPass(PassRegistry &); -void initializeRegToMemWrapperPassPass(PassRegistry &); -void initializeRegUsageInfoCollectorLegacyPass(PassRegistry &); -void initializeRegUsageInfoPropagationLegacyPass(PassRegistry &); -void initializeRegionInfoPassPass(PassRegistry &); -void 
initializeRegionOnlyPrinterPass(PassRegistry &); -void initializeRegionOnlyViewerPass(PassRegistry &); -void initializeRegionPrinterPass(PassRegistry &); -void initializeRegionViewerPass(PassRegistry &); -void initializeRegisterCoalescerLegacyPass(PassRegistry &); -void initializeRemoveLoadsIntoFakeUsesLegacyPass(PassRegistry &); -void initializeRemoveRedundantDebugValuesLegacyPass(PassRegistry &); -void initializeRenameIndependentSubregsLegacyPass(PassRegistry &); -void initializeReplaceWithVeclibLegacyPass(PassRegistry &); -void initializeResetMachineFunctionPass(PassRegistry &); -void initializeSCEVAAWrapperPassPass(PassRegistry &); -void initializeSROALegacyPassPass(PassRegistry &); -void initializeSafeStackLegacyPassPass(PassRegistry &); -void initializeSafepointIRVerifierPass(PassRegistry &); -void initializeSelectOptimizePass(PassRegistry &); -void initializeScalarEvolutionWrapperPassPass(PassRegistry &); -void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &); -void initializeScalarizerLegacyPassPass(PassRegistry &); -void initializeScavengerTestPass(PassRegistry &); -void initializeScopedNoAliasAAWrapperPassPass(PassRegistry &); -void initializeSeparateConstOffsetFromGEPLegacyPassPass(PassRegistry &); -void initializeShadowStackGCLoweringPass(PassRegistry &); -void initializeShrinkWrapLegacyPass(PassRegistry &); -void initializeSingleLoopExtractorPass(PassRegistry &); -void initializeSinkingLegacyPassPass(PassRegistry &); -void initializeSjLjEHPreparePass(PassRegistry &); -void initializeSlotIndexesWrapperPassPass(PassRegistry &); -void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &); -void initializeSpillPlacementWrapperLegacyPass(PassRegistry &); -void initializeStackColoringLegacyPass(PassRegistry &); -void initializeStackFrameLayoutAnalysisLegacyPass(PassRegistry &); -void initializeStaticDataSplitterPass(PassRegistry &); -void initializeStackMapLivenessPass(PassRegistry &); -void initializeStackProtectorPass(PassRegistry 
&); -void initializeStackSafetyGlobalInfoWrapperPassPass(PassRegistry &); -void initializeStackSafetyInfoWrapperPassPass(PassRegistry &); -void initializeStackSlotColoringLegacyPass(PassRegistry &); -void initializeStraightLineStrengthReduceLegacyPassPass(PassRegistry &); -void initializeStripDebugMachineModulePass(PassRegistry &); -void initializeStructurizeCFGLegacyPassPass(PassRegistry &); -void initializeTailCallElimPass(PassRegistry &); -void initializeTailDuplicateLegacyPass(PassRegistry &); -void initializeTargetLibraryInfoWrapperPassPass(PassRegistry &); -void initializeTargetPassConfigPass(PassRegistry &); -void initializeTargetTransformInfoWrapperPassPass(PassRegistry &); -void initializeTwoAddressInstructionLegacyPassPass(PassRegistry &); -void initializeTypeBasedAAWrapperPassPass(PassRegistry &); -void initializeTypePromotionLegacyPass(PassRegistry &); -void initializeInitUndefPass(PassRegistry &); -void initializeUniformityInfoWrapperPassPass(PassRegistry &); -void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &); -void initializeUnpackMachineBundlesPass(PassRegistry &); -void initializeUnreachableBlockElimLegacyPassPass(PassRegistry &); -void initializeUnreachableMachineBlockElimLegacyPass(PassRegistry &); -void initializeVerifierLegacyPassPass(PassRegistry &); -void initializeVirtRegMapWrapperLegacyPass(PassRegistry &); -void initializeVirtRegRewriterLegacyPass(PassRegistry &); -void initializeWasmEHPreparePass(PassRegistry &); -void initializeWinEHPreparePass(PassRegistry &); -void initializeWriteBitcodePassPass(PassRegistry &); -void initializeXRayInstrumentationLegacyPass(PassRegistry &); +LLVM_ABI void initializeAAResultsWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeAlwaysInlinerLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeAssignmentTrackingAnalysisPass(PassRegistry &); +LLVM_ABI void initializeAssumptionCacheTrackerPass(PassRegistry &); +LLVM_ABI void initializeAtomicExpandLegacyPass(PassRegistry &); +LLVM_ABI void 
initializeBasicBlockPathCloningPass(PassRegistry &); +LLVM_ABI void +initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeBasicBlockSectionsPass(PassRegistry &); +LLVM_ABI void initializeBarrierNoopPass(PassRegistry &); +LLVM_ABI void initializeBasicAAWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeBlockFrequencyInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeBranchFolderLegacyPass(PassRegistry &); +LLVM_ABI void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeBranchRelaxationLegacyPass(PassRegistry &); +LLVM_ABI void initializeBreakCriticalEdgesPass(PassRegistry &); +LLVM_ABI void initializeBreakFalseDepsPass(PassRegistry &); +LLVM_ABI void initializeCanonicalizeFreezeInLoopsPass(PassRegistry &); +LLVM_ABI void initializeCFGSimplifyPassPass(PassRegistry &); +LLVM_ABI void initializeCFGuardPass(PassRegistry &); +LLVM_ABI void initializeCFGuardLongjmpPass(PassRegistry &); +LLVM_ABI void initializeCFIFixupPass(PassRegistry &); +LLVM_ABI void initializeCFIInstrInserterPass(PassRegistry &); +LLVM_ABI void initializeCallBrPreparePass(PassRegistry &); +LLVM_ABI void initializeCallGraphDOTPrinterPass(PassRegistry &); +LLVM_ABI void initializeCallGraphViewerPass(PassRegistry &); +LLVM_ABI void initializeCallGraphWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeCheckDebugMachineModulePass(PassRegistry &); +LLVM_ABI void initializeCodeGenPrepareLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeComplexDeinterleavingLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeConstantHoistingLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeCycleInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDAEPass(PassRegistry &); +LLVM_ABI void initializeDAHPass(PassRegistry &); +LLVM_ABI void initializeDCELegacyPassPass(PassRegistry &); +LLVM_ABI void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &); +LLVM_ABI void 
initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &); +LLVM_ABI void initializeDXILResourceBindingWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &); +LLVM_ABI void initializeDXILResourceTypeWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDXILResourceWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDeadMachineInstructionElimPass(PassRegistry &); +LLVM_ABI void initializeDebugifyMachineModulePass(PassRegistry &); +LLVM_ABI void initializeDependenceAnalysisWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDetectDeadLanesLegacyPass(PassRegistry &); +LLVM_ABI void initializeDomOnlyPrinterWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDomOnlyViewerWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDomPrinterWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDomViewerWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDominanceFrontierWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDominatorTreeWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeEarlyCSELegacyPassPass(PassRegistry &); +LLVM_ABI void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &); +LLVM_ABI void initializeEarlyIfConverterLegacyPass(PassRegistry &); +LLVM_ABI void initializeEarlyIfPredicatorPass(PassRegistry &); +LLVM_ABI void initializeEarlyMachineLICMPass(PassRegistry &); +LLVM_ABI void initializeEarlyTailDuplicateLegacyPass(PassRegistry &); +LLVM_ABI void initializeEdgeBundlesWrapperLegacyPass(PassRegistry &); +LLVM_ABI void initializeEHContGuardTargetsPass(PassRegistry &); +LLVM_ABI void initializeExpandFpLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeExpandLargeDivRemLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeExpandMemCmpLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeExpandPostRALegacyPass(PassRegistry &); +LLVM_ABI void 
initializeExpandReductionsPass(PassRegistry &); +LLVM_ABI void initializeExpandVariadicsPass(PassRegistry &); +LLVM_ABI void initializeExternalAAWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeFEntryInserterLegacyPass(PassRegistry &); +LLVM_ABI void initializeFinalizeISelPass(PassRegistry &); +LLVM_ABI void initializeFinalizeMachineBundlesPass(PassRegistry &); +LLVM_ABI void initializeFixIrreduciblePass(PassRegistry &); +LLVM_ABI void initializeFixupStatepointCallerSavedLegacyPass(PassRegistry &); +LLVM_ABI void initializeFlattenCFGLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeFuncletLayoutPass(PassRegistry &); +LLVM_ABI void initializeGCEmptyBasicBlocksPass(PassRegistry &); +LLVM_ABI void initializeGCMachineCodeAnalysisPass(PassRegistry &); +LLVM_ABI void initializeGCModuleInfoPass(PassRegistry &); +LLVM_ABI void initializeGVNLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeGlobalMergeFuncPassWrapperPass(PassRegistry &); +LLVM_ABI void initializeGlobalMergePass(PassRegistry &); +LLVM_ABI void initializeGlobalsAAWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeHardwareLoopsLegacyPass(PassRegistry &); +LLVM_ABI void initializeMIRProfileLoaderPassPass(PassRegistry &); +LLVM_ABI void initializeIRSimilarityIdentifierWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeIRTranslatorPass(PassRegistry &); +LLVM_ABI void initializeIVUsersWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeIfConverterPass(PassRegistry &); +LLVM_ABI void +initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeImplicitNullChecksPass(PassRegistry &); +LLVM_ABI void initializeIndirectBrExpandLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeInferAddressSpacesPass(PassRegistry &); +LLVM_ABI void initializeInstSimplifyLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeInstructionCombiningPassPass(PassRegistry &); +LLVM_ABI void initializeInstructionSelectPass(PassRegistry &); +LLVM_ABI void 
initializeInterleavedAccessPass(PassRegistry &); +LLVM_ABI void initializeInterleavedLoadCombinePass(PassRegistry &); +LLVM_ABI void initializeJMCInstrumenterPass(PassRegistry &); +LLVM_ABI void initializeKCFIPass(PassRegistry &); +LLVM_ABI void initializeLCSSAVerificationPassPass(PassRegistry &); +LLVM_ABI void initializeLCSSAWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeLazyBFIPassPass(PassRegistry &); +LLVM_ABI void initializeLazyBlockFrequencyInfoPassPass(PassRegistry &); +LLVM_ABI void initializeLazyBranchProbabilityInfoPassPass(PassRegistry &); +LLVM_ABI void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry &); +LLVM_ABI void initializeLazyValueInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeLegacyLICMPassPass(PassRegistry &); +LLVM_ABI void initializeLegalizerPass(PassRegistry &); +LLVM_ABI void initializeGISelCSEAnalysisWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeGISelValueTrackingAnalysisLegacyPass(PassRegistry &); +LLVM_ABI void initializeLiveDebugValuesLegacyPass(PassRegistry &); +LLVM_ABI void initializeLiveDebugVariablesWrapperLegacyPass(PassRegistry &); +LLVM_ABI void initializeLiveIntervalsWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeLiveRangeShrinkPass(PassRegistry &); +LLVM_ABI void initializeLiveRegMatrixWrapperLegacyPass(PassRegistry &); +LLVM_ABI void initializeLiveStacksWrapperLegacyPass(PassRegistry &); +LLVM_ABI void initializeLiveVariablesWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeLoadStoreOptPass(PassRegistry &); +LLVM_ABI void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeLocalStackSlotPassPass(PassRegistry &); +LLVM_ABI void initializeLocalizerPass(PassRegistry &); +LLVM_ABI void initializeLoopDataPrefetchLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeLoopExtractorLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeLoopInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeLoopPassPass(PassRegistry 
&); +LLVM_ABI void initializeLoopSimplifyPass(PassRegistry &); +LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &); +LLVM_ABI void initializeLoopTermFoldPass(PassRegistry &); +LLVM_ABI void initializeLoopUnrollPass(PassRegistry &); +LLVM_ABI void initializeLowerAtomicLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeLowerEmuTLSPass(PassRegistry &); +LLVM_ABI void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeLowerIntrinsicsPass(PassRegistry &); +LLVM_ABI void initializeLowerInvokeLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeLowerSwitchLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeMIRAddFSDiscriminatorsPass(PassRegistry &); +LLVM_ABI void initializeMIRCanonicalizerPass(PassRegistry &); +LLVM_ABI void initializeMIRNamerPass(PassRegistry &); +LLVM_ABI void initializeMIRPrintingPassPass(PassRegistry &); +LLVM_ABI void +initializeMachineBlockFrequencyInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeMachineBlockPlacementLegacyPass(PassRegistry &); +LLVM_ABI void initializeMachineBlockPlacementStatsLegacyPass(PassRegistry &); +LLVM_ABI void +initializeMachineBranchProbabilityInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeMachineCFGPrinterPass(PassRegistry &); +LLVM_ABI void initializeMachineCSELegacyPass(PassRegistry &); +LLVM_ABI void initializeMachineCombinerPass(PassRegistry &); +LLVM_ABI void initializeMachineCopyPropagationLegacyPass(PassRegistry &); +LLVM_ABI void initializeMachineCycleInfoPrinterLegacyPass(PassRegistry &); +LLVM_ABI void initializeMachineCycleInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeMachineDominanceFrontierPass(PassRegistry &); +LLVM_ABI void initializeMachineDominatorTreeWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeMachineFunctionPrinterPassPass(PassRegistry &); +LLVM_ABI void initializeMachineFunctionSplitterPass(PassRegistry &); +LLVM_ABI void initializeMachineLateInstrsCleanupLegacyPass(PassRegistry &); 
+LLVM_ABI void initializeMachineLICMPass(PassRegistry &); +LLVM_ABI void initializeMachineLoopInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeMachineModuleInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void +initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry &); +LLVM_ABI void initializeMachineOutlinerPass(PassRegistry &); +LLVM_ABI void initializeStaticDataProfileInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeStaticDataAnnotatorPass(PassRegistry &); +LLVM_ABI void initializeMachinePipelinerPass(PassRegistry &); +LLVM_ABI void initializeMachinePostDominatorTreeWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeMachineRegionInfoPassPass(PassRegistry &); +LLVM_ABI void +initializeMachineSanitizerBinaryMetadataLegacyPass(PassRegistry &); +LLVM_ABI void initializeMachineSchedulerLegacyPass(PassRegistry &); +LLVM_ABI void initializeMachineSinkingLegacyPass(PassRegistry &); +LLVM_ABI void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &); +LLVM_ABI void initializeMachineUniformityAnalysisPassPass(PassRegistry &); +LLVM_ABI void initializeMachineVerifierLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeMemoryDependenceWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeMemorySSAWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeMergeICmpsLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeModuleSummaryIndexWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeModuloScheduleTestPass(PassRegistry &); +LLVM_ABI void initializeNaryReassociateLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeObjCARCContractLegacyPassPass(PassRegistry &); +LLVM_ABI void +initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeOptimizePHIsLegacyPass(PassRegistry &); +LLVM_ABI void initializePEILegacyPass(PassRegistry &); +LLVM_ABI void initializePHIEliminationPass(PassRegistry &); 
+LLVM_ABI void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry &); +LLVM_ABI void initializePatchableFunctionLegacyPass(PassRegistry &); +LLVM_ABI void initializePeepholeOptimizerLegacyPass(PassRegistry &); +LLVM_ABI void initializePhiValuesWrapperPassPass(PassRegistry &); +LLVM_ABI void +initializePhysicalRegisterUsageInfoWrapperLegacyPass(PassRegistry &); +LLVM_ABI void initializePlaceBackedgeSafepointsLegacyPassPass(PassRegistry &); +LLVM_ABI void initializePostDomOnlyPrinterWrapperPassPass(PassRegistry &); +LLVM_ABI void initializePostDomOnlyViewerWrapperPassPass(PassRegistry &); +LLVM_ABI void initializePostDomPrinterWrapperPassPass(PassRegistry &); +LLVM_ABI void initializePostDomViewerWrapperPassPass(PassRegistry &); +LLVM_ABI void initializePostDominatorTreeWrapperPassPass(PassRegistry &); +LLVM_ABI void initializePostInlineEntryExitInstrumenterPass(PassRegistry &); +LLVM_ABI void initializePostMachineSchedulerLegacyPass(PassRegistry &); +LLVM_ABI void initializePostRAHazardRecognizerLegacyPass(PassRegistry &); +LLVM_ABI void initializePostRAMachineSinkingPass(PassRegistry &); +LLVM_ABI void initializePostRASchedulerLegacyPass(PassRegistry &); +LLVM_ABI void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry &); +LLVM_ABI void initializePrintFunctionPassWrapperPass(PassRegistry &); +LLVM_ABI void initializePrintModulePassWrapperPass(PassRegistry &); +LLVM_ABI void initializeProcessImplicitDefsPass(PassRegistry &); +LLVM_ABI void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializePromoteLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeRABasicPass(PassRegistry &); +LLVM_ABI void initializePseudoProbeInserterPass(PassRegistry &); +LLVM_ABI void initializeRAGreedyLegacyPass(PassRegistry &); +LLVM_ABI void initializeReachingDefAnalysisPass(PassRegistry &); +LLVM_ABI void initializeReassociateLegacyPassPass(PassRegistry &); +LLVM_ABI void 
+initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &); +LLVM_ABI void initializeRegAllocFastPass(PassRegistry &); +LLVM_ABI void +initializeRegAllocPriorityAdvisorAnalysisLegacyPass(PassRegistry &); +LLVM_ABI void initializeRegAllocScoringPass(PassRegistry &); +LLVM_ABI void initializeRegBankSelectPass(PassRegistry &); +LLVM_ABI void initializeRegToMemWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeRegUsageInfoCollectorLegacyPass(PassRegistry &); +LLVM_ABI void initializeRegUsageInfoPropagationLegacyPass(PassRegistry &); +LLVM_ABI void initializeRegionInfoPassPass(PassRegistry &); +LLVM_ABI void initializeRegionOnlyPrinterPass(PassRegistry &); +LLVM_ABI void initializeRegionOnlyViewerPass(PassRegistry &); +LLVM_ABI void initializeRegionPrinterPass(PassRegistry &); +LLVM_ABI void initializeRegionViewerPass(PassRegistry &); +LLVM_ABI void initializeRegisterCoalescerLegacyPass(PassRegistry &); +LLVM_ABI void initializeRemoveLoadsIntoFakeUsesLegacyPass(PassRegistry &); +LLVM_ABI void initializeRemoveRedundantDebugValuesLegacyPass(PassRegistry &); +LLVM_ABI void initializeRenameIndependentSubregsLegacyPass(PassRegistry &); +LLVM_ABI void initializeReplaceWithVeclibLegacyPass(PassRegistry &); +LLVM_ABI void initializeResetMachineFunctionPass(PassRegistry &); +LLVM_ABI void initializeSCEVAAWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeSROALegacyPassPass(PassRegistry &); +LLVM_ABI void initializeSafeStackLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeSafepointIRVerifierPass(PassRegistry &); +LLVM_ABI void initializeSelectOptimizePass(PassRegistry &); +LLVM_ABI void initializeScalarEvolutionWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeScalarizerLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeScavengerTestPass(PassRegistry &); +LLVM_ABI void initializeScopedNoAliasAAWrapperPassPass(PassRegistry &); +LLVM_ABI void 
+initializeSeparateConstOffsetFromGEPLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeShadowStackGCLoweringPass(PassRegistry &); +LLVM_ABI void initializeShrinkWrapLegacyPass(PassRegistry &); +LLVM_ABI void initializeSingleLoopExtractorPass(PassRegistry &); +LLVM_ABI void initializeSinkingLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeSjLjEHPreparePass(PassRegistry &); +LLVM_ABI void initializeSlotIndexesWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeSpillPlacementWrapperLegacyPass(PassRegistry &); +LLVM_ABI void initializeStackColoringLegacyPass(PassRegistry &); +LLVM_ABI void initializeStackFrameLayoutAnalysisLegacyPass(PassRegistry &); +LLVM_ABI void initializeStaticDataSplitterPass(PassRegistry &); +LLVM_ABI void initializeStackMapLivenessPass(PassRegistry &); +LLVM_ABI void initializeStackProtectorPass(PassRegistry &); +LLVM_ABI void initializeStackSafetyGlobalInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeStackSafetyInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeStackSlotColoringLegacyPass(PassRegistry &); +LLVM_ABI void +initializeStraightLineStrengthReduceLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeStripDebugMachineModulePass(PassRegistry &); +LLVM_ABI void initializeStructurizeCFGLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeTailCallElimPass(PassRegistry &); +LLVM_ABI void initializeTailDuplicateLegacyPass(PassRegistry &); +LLVM_ABI void initializeTargetLibraryInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeTargetPassConfigPass(PassRegistry &); +LLVM_ABI void initializeTargetTransformInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeTwoAddressInstructionLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeTypeBasedAAWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeTypePromotionLegacyPass(PassRegistry &); +LLVM_ABI void initializeInitUndefPass(PassRegistry &); 
+LLVM_ABI void initializeUniformityInfoWrapperPassPass(PassRegistry &); +LLVM_ABI void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeUnpackMachineBundlesPass(PassRegistry &); +LLVM_ABI void initializeUnreachableBlockElimLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeUnreachableMachineBlockElimLegacyPass(PassRegistry &); +LLVM_ABI void initializeVerifierLegacyPassPass(PassRegistry &); +LLVM_ABI void initializeVirtRegMapWrapperLegacyPass(PassRegistry &); +LLVM_ABI void initializeVirtRegRewriterLegacyPass(PassRegistry &); +LLVM_ABI void initializeWasmEHPreparePass(PassRegistry &); +LLVM_ABI void initializeWinEHPreparePass(PassRegistry &); +LLVM_ABI void initializeWriteBitcodePassPass(PassRegistry &); +LLVM_ABI void initializeXRayInstrumentationLegacyPass(PassRegistry &); } // end namespace llvm diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index a7bf1b965bf2d..93ce3cc444213 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -715,7 +715,10 @@ class LLVM_ABI MCAsmInfo { std::optional getSpecifierForName(StringRef Name) const; void printExpr(raw_ostream &, const MCExpr &) const; - virtual void printSpecifierExpr(raw_ostream &, const MCSpecifierExpr &) const; + virtual void printSpecifierExpr(raw_ostream &, + const MCSpecifierExpr &) const { + llvm_unreachable("Need to implement hook if target uses MCSpecifierExpr"); + } virtual bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &Res, const MCAssembler *Asm) const; }; diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index cd57fafc50b56..4ec780d8ff94f 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -512,7 +512,6 @@ class LLVM_ABI MCSpecifierExpr : public MCExpr { explicit MCSpecifierExpr(const MCExpr *Expr, Spec S, SMLoc Loc = SMLoc()) : MCExpr(Specifier, Loc), Expr(Expr), specifier(S) {} - virtual ~MCSpecifierExpr() = default; public: 
LLVM_ABI static const MCSpecifierExpr * @@ -523,12 +522,6 @@ class LLVM_ABI MCSpecifierExpr : public MCExpr { Spec getSpecifier() const { return specifier; } const MCExpr *getSubExpr() const { return Expr; } - virtual void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { - llvm_unreachable("Replace MCExpr::print calls with MCAsmInfo::printExpr"); - } - virtual bool evaluateAsRelocatableImpl(MCValue &Res, - const MCAssembler *Asm) const; - static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Specifier; } diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h index a0dc522e13cab..8d7545144dfd9 100644 --- a/llvm/include/llvm/Object/ELF.h +++ b/llvm/include/llvm/Object/ELF.h @@ -256,8 +256,10 @@ class ELFFile { public: LLVM_ELF_IMPORT_TYPES_ELFT(ELFT) - // Default ctor required to instantiate the template for DLL export. + // Default ctor and copy assignment operator required to instantiate the + // template for DLL export. ELFFile(const ELFFile &) = default; + ELFFile &operator=(const ELFFile &) = default; // This is a callback that can be passed to a number of functions. 
// It can be used to ignore non-critical errors (warnings), which is diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h index 6c712956dfb5d..4e7984c54a72a 100644 --- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h +++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h @@ -19,6 +19,7 @@ #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/DebugSubsection.h" #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/YAMLTraits.h" #include @@ -108,23 +109,24 @@ struct InlineeInfo { }; struct YAMLDebugSubsection { - static Expected + LLVM_ABI static Expected fromCodeViewSubection(const codeview::StringsAndChecksumsRef &SC, const codeview::DebugSubsectionRecord &SS); std::shared_ptr Subsection; }; -Expected>> +LLVM_ABI Expected>> toCodeViewSubsectionList(BumpPtrAllocator &Allocator, ArrayRef Subsections, const codeview::StringsAndChecksums &SC); -std::vector +LLVM_ABI std::vector fromDebugS(ArrayRef Data, const codeview::StringsAndChecksumsRef &SC); -void initializeStringsAndChecksums(ArrayRef Sections, - codeview::StringsAndChecksums &SC); +LLVM_ABI void +initializeStringsAndChecksums(ArrayRef Sections, + codeview::StringsAndChecksums &SC); } // end namespace CodeViewYAML diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h index 7c05c9eea05ed..dccc77dc1a0c5 100644 --- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h +++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h @@ -16,6 +16,7 @@ #include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/DebugInfo/CodeView/SymbolRecord.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/YAMLTraits.h" #include @@ -32,11 +33,12 @@ struct SymbolRecordBase; struct SymbolRecord { std::shared_ptr Symbol; - 
codeview::CVSymbol + LLVM_ABI codeview::CVSymbol toCodeViewSymbol(BumpPtrAllocator &Allocator, codeview::CodeViewContainer Container) const; - static Expected fromCodeViewSymbol(codeview::CVSymbol Symbol); + LLVM_ABI static Expected + fromCodeViewSymbol(codeview::CVSymbol Symbol); }; } // end namespace CodeViewYAML diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h index 04b5e0ba3aa1a..3c239ce507dfc 100644 --- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h +++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h @@ -17,6 +17,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/YAMLTraits.h" #include @@ -45,15 +46,16 @@ struct MemberRecord { struct LeafRecord { std::shared_ptr Leaf; - codeview::CVType + LLVM_ABI codeview::CVType toCodeViewRecord(codeview::AppendingTypeTableBuilder &Serializer) const; - static Expected fromCodeViewRecord(codeview::CVType Type); + LLVM_ABI static Expected + fromCodeViewRecord(codeview::CVType Type); }; -std::vector fromDebugT(ArrayRef DebugTorP, - StringRef SectionName); -ArrayRef toDebugT(ArrayRef, BumpPtrAllocator &Alloc, - StringRef SectionName); +LLVM_ABI std::vector fromDebugT(ArrayRef DebugTorP, + StringRef SectionName); +LLVM_ABI ArrayRef +toDebugT(ArrayRef, BumpPtrAllocator &Alloc, StringRef SectionName); } // end namespace CodeViewYAML diff --git a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h index 5e1b88f4fef64..050ff60bcd408 100644 --- a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h +++ b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h @@ -14,6 +14,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/TargetParser/Host.h" @@ -27,26 
+28,26 @@ namespace DWARFYAML { struct Data; -Error emitDebugAbbrev(raw_ostream &OS, const Data &DI); -Error emitDebugStr(raw_ostream &OS, const Data &DI); - -Error emitDebugAranges(raw_ostream &OS, const Data &DI); -Error emitDebugRanges(raw_ostream &OS, const Data &DI); -Error emitDebugPubnames(raw_ostream &OS, const Data &DI); -Error emitDebugPubtypes(raw_ostream &OS, const Data &DI); -Error emitDebugGNUPubnames(raw_ostream &OS, const Data &DI); -Error emitDebugGNUPubtypes(raw_ostream &OS, const Data &DI); -Error emitDebugInfo(raw_ostream &OS, const Data &DI); -Error emitDebugLine(raw_ostream &OS, const Data &DI); -Error emitDebugAddr(raw_ostream &OS, const Data &DI); -Error emitDebugStrOffsets(raw_ostream &OS, const Data &DI); -Error emitDebugRnglists(raw_ostream &OS, const Data &DI); -Error emitDebugLoclists(raw_ostream &OS, const Data &DI); -Error emitDebugNames(raw_ostream &OS, const Data &DI); - -std::function +LLVM_ABI Error emitDebugAbbrev(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugStr(raw_ostream &OS, const Data &DI); + +LLVM_ABI Error emitDebugAranges(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugRanges(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugPubnames(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugPubtypes(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugGNUPubnames(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugGNUPubtypes(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugInfo(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugLine(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugAddr(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugStrOffsets(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugRnglists(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugLoclists(raw_ostream &OS, const Data &DI); +LLVM_ABI Error emitDebugNames(raw_ostream &OS, const Data &DI); + +LLVM_ABI std::function getDWARFEmitterByName(StringRef 
SecName); -Expected>> +LLVM_ABI Expected>> emitDebugSections(StringRef YAMLString, bool IsLittleEndian = sys::IsLittleEndianHost, bool Is64BitAddrSize = true); diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h index 69f8c4f27d7a3..c8528686592ab 100644 --- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h +++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h @@ -19,6 +19,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/ObjectYAML/YAML.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/YAMLTraits.h" #include #include @@ -255,16 +256,16 @@ struct Data { std::optional>> DebugLoclists; std::optional DebugNames; - bool isEmpty() const; + LLVM_ABI bool isEmpty() const; - SetVector getNonEmptySectionNames() const; + LLVM_ABI SetVector getNonEmptySectionNames() const; struct AbbrevTableInfo { uint64_t Index; uint64_t Offset; }; - Expected getAbbrevTableInfoByID(uint64_t ID) const; - StringRef getAbbrevTableContentByIndex(uint64_t Index) const; + LLVM_ABI Expected getAbbrevTableInfoByID(uint64_t ID) const; + LLVM_ABI StringRef getAbbrevTableContentByIndex(uint64_t Index) const; private: mutable std::unordered_map AbbrevTableInfoMap; @@ -310,88 +311,90 @@ namespace llvm { namespace yaml { template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::Data &DWARF); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::Data &DWARF); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::AbbrevTable &AbbrevTable); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::AbbrevTable &AbbrevTable); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::Abbrev &Abbrev); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::Abbrev &Abbrev); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev); }; template <> struct 
MappingTraits { - static void mapping(IO &IO, DWARFYAML::ARangeDescriptor &Descriptor); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::ARangeDescriptor &Descriptor); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::ARange &ARange); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::ARange &ARange); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::RangeEntry &Entry); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::RangeEntry &Entry); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::Ranges &Ranges); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::Ranges &Ranges); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::PubEntry &Entry); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::PubEntry &Entry); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::PubSection &Section); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::PubSection &Section); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::Unit &Unit); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::Unit &Unit); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::DebugNamesSection &); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::DebugNamesSection &); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::DebugNameEntry &); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::DebugNameEntry &); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::DebugNameAbbreviation &); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::DebugNameAbbreviation &); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::IdxForm &); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::IdxForm &); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::Entry &Entry); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::Entry &Entry); }; 
template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::FormValue &FormValue); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::FormValue &FormValue); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::File &File); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::File &File); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::LineTableOpcode &LineTableOpcode); + LLVM_ABI static void mapping(IO &IO, + DWARFYAML::LineTableOpcode &LineTableOpcode); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::LineTable &LineTable); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::LineTable &LineTable); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::SegAddrPair &SegAddrPair); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::SegAddrPair &SegAddrPair); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::DWARFOperation &DWARFOperation); + LLVM_ABI static void mapping(IO &IO, + DWARFYAML::DWARFOperation &DWARFOperation); }; template @@ -407,19 +410,20 @@ struct MappingTraits> { }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::RnglistEntry &RnglistEntry); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::RnglistEntry &RnglistEntry); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::LoclistEntry &LoclistEntry); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::LoclistEntry &LoclistEntry); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::AddrTableEntry &AddrTable); + LLVM_ABI static void mapping(IO &IO, DWARFYAML::AddrTableEntry &AddrTable); }; template <> struct MappingTraits { - static void mapping(IO &IO, DWARFYAML::StringOffsetsTable &StrOffsetsTable); + LLVM_ABI static void mapping(IO &IO, + DWARFYAML::StringOffsetsTable &StrOffsetsTable); }; template <> struct ScalarEnumerationTraits { diff --git 
a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 8a0dfd8718796..c235112dacf7c 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -19,6 +19,7 @@ #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/DXContainer.h" #include "llvm/ObjectYAML/YAML.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/YAMLTraits.h" #include #include @@ -59,14 +60,14 @@ struct DXILProgram { #define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str) bool Val = false; struct ShaderFeatureFlags { ShaderFeatureFlags() = default; - ShaderFeatureFlags(uint64_t FlagData); - uint64_t getEncodedFlags(); + LLVM_ABI ShaderFeatureFlags(uint64_t FlagData); + LLVM_ABI uint64_t getEncodedFlags(); #include "llvm/BinaryFormat/DXContainerConstants.def" }; struct ShaderHash { ShaderHash() = default; - ShaderHash(const dxbc::ShaderHash &Data); + LLVM_ABI ShaderHash(const dxbc::ShaderHash &Data); bool IncludesSource; std::vector Digest; @@ -84,7 +85,7 @@ struct RootDescriptorYaml { uint32_t ShaderRegister; uint32_t RegisterSpace; - uint32_t getEncodedFlags() const; + LLVM_ABI uint32_t getEncodedFlags() const; #define ROOT_DESCRIPTOR_FLAG(Num, Val) bool Val = false; #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -97,7 +98,7 @@ struct DescriptorRangeYaml { uint32_t RegisterSpace; uint32_t OffsetInDescriptorsFromTableStart; - uint32_t getEncodedFlags() const; + LLVM_ABI uint32_t getEncodedFlags() const; #define DESCRIPTOR_RANGE_FLAG(Num, Val) bool Val = false; #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -193,13 +194,13 @@ struct RootSignatureYamlDesc { RootParameterYamlDesc Parameters; SmallVector StaticSamplers; - uint32_t getEncodedFlags(); + LLVM_ABI uint32_t getEncodedFlags(); iterator_range samplers() { return make_range(StaticSamplers.begin(), StaticSamplers.end()); } - static llvm::Expected + LLVM_ABI static llvm::Expected create(const 
object::DirectX::RootSignature &Data); #define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false; @@ -258,13 +259,13 @@ struct PSVInfo { StringRef EntryName; - void mapInfoForVersion(yaml::IO &IO); + LLVM_ABI void mapInfoForVersion(yaml::IO &IO); - PSVInfo(); - PSVInfo(const dxbc::PSV::v0::RuntimeInfo *P, uint16_t Stage); - PSVInfo(const dxbc::PSV::v1::RuntimeInfo *P); - PSVInfo(const dxbc::PSV::v2::RuntimeInfo *P); - PSVInfo(const dxbc::PSV::v3::RuntimeInfo *P, StringRef StringTable); + LLVM_ABI PSVInfo(); + LLVM_ABI PSVInfo(const dxbc::PSV::v0::RuntimeInfo *P, uint16_t Stage); + LLVM_ABI PSVInfo(const dxbc::PSV::v1::RuntimeInfo *P); + LLVM_ABI PSVInfo(const dxbc::PSV::v2::RuntimeInfo *P); + LLVM_ABI PSVInfo(const dxbc::PSV::v3::RuntimeInfo *P, StringRef StringTable); }; struct SignatureParameter { @@ -328,88 +329,96 @@ class raw_ostream; namespace yaml { template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::VersionTuple &Version); + LLVM_ABI static void mapping(IO &IO, DXContainerYAML::VersionTuple &Version); }; template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::FileHeader &Header); + LLVM_ABI static void mapping(IO &IO, DXContainerYAML::FileHeader &Header); }; template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program); + LLVM_ABI static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program); }; template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags); + LLVM_ABI static void mapping(IO &IO, + DXContainerYAML::ShaderFeatureFlags &Flags); }; template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::ShaderHash &Hash); + LLVM_ABI static void mapping(IO &IO, DXContainerYAML::ShaderHash &Hash); }; template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::PSVInfo &PSV); + LLVM_ABI static void mapping(IO &IO, DXContainerYAML::PSVInfo &PSV); }; template <> struct 
MappingTraits { - static void mapping(IO &IO, DXContainerYAML::Part &Version); + LLVM_ABI static void mapping(IO &IO, DXContainerYAML::Part &Version); }; template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::Object &Obj); + LLVM_ABI static void mapping(IO &IO, DXContainerYAML::Object &Obj); }; template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::ResourceFlags &Flags); + LLVM_ABI static void mapping(IO &IO, DXContainerYAML::ResourceFlags &Flags); }; template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::ResourceBindInfo &Res); + LLVM_ABI static void mapping(IO &IO, DXContainerYAML::ResourceBindInfo &Res); }; template <> struct MappingTraits { - static void mapping(IO &IO, llvm::DXContainerYAML::SignatureElement &El); + LLVM_ABI static void mapping(IO &IO, + llvm::DXContainerYAML::SignatureElement &El); }; template <> struct MappingTraits { - static void mapping(IO &IO, llvm::DXContainerYAML::SignatureParameter &El); + LLVM_ABI static void mapping(IO &IO, + llvm::DXContainerYAML::SignatureParameter &El); }; template <> struct MappingTraits { - static void mapping(IO &IO, llvm::DXContainerYAML::Signature &El); + LLVM_ABI static void mapping(IO &IO, llvm::DXContainerYAML::Signature &El); }; template <> struct MappingTraits { - static void mapping(IO &IO, - DXContainerYAML::RootSignatureYamlDesc &RootSignature); + LLVM_ABI static void + mapping(IO &IO, DXContainerYAML::RootSignatureYamlDesc &RootSignature); }; template <> struct MappingContextTraits { - static void mapping(IO &IO, - llvm::DXContainerYAML::RootParameterLocationYaml &L, - DXContainerYAML::RootSignatureYamlDesc &S); + LLVM_ABI static void + mapping(IO &IO, llvm::DXContainerYAML::RootParameterLocationYaml &L, + DXContainerYAML::RootSignatureYamlDesc &S); }; template <> struct MappingTraits { - static void mapping(IO &IO, llvm::DXContainerYAML::RootConstantsYaml &C); + LLVM_ABI static void mapping(IO &IO, + 
llvm::DXContainerYAML::RootConstantsYaml &C); }; template <> struct MappingTraits { - static void mapping(IO &IO, llvm::DXContainerYAML::RootDescriptorYaml &D); + LLVM_ABI static void mapping(IO &IO, + llvm::DXContainerYAML::RootDescriptorYaml &D); }; template <> struct MappingTraits { - static void mapping(IO &IO, llvm::DXContainerYAML::DescriptorTableYaml &D); + LLVM_ABI static void mapping(IO &IO, + llvm::DXContainerYAML::DescriptorTableYaml &D); }; template <> struct MappingTraits { - static void mapping(IO &IO, llvm::DXContainerYAML::DescriptorRangeYaml &D); + LLVM_ABI static void mapping(IO &IO, + llvm::DXContainerYAML::DescriptorRangeYaml &D); }; template <> struct MappingTraits { - static void mapping(IO &IO, llvm::DXContainerYAML::StaticSamplerYamlDesc &S); + LLVM_ABI static void mapping(IO &IO, + llvm::DXContainerYAML::StaticSamplerYamlDesc &S); }; } // namespace yaml diff --git a/llvm/include/llvm/ObjectYAML/YAML.h b/llvm/include/llvm/ObjectYAML/YAML.h index 3bf6527a7e2da..709520c934d7d 100644 --- a/llvm/include/llvm/ObjectYAML/YAML.h +++ b/llvm/include/llvm/ObjectYAML/YAML.h @@ -11,6 +11,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/YAMLTraits.h" #include @@ -86,13 +87,13 @@ class BinaryRef { /// Write the contents (regardless of whether it is binary or a /// hex string) as binary to the given raw_ostream. /// N can be used to specify the maximum number of bytes. - void writeAsBinary(raw_ostream &OS, uint64_t N = UINT64_MAX) const; + LLVM_ABI void writeAsBinary(raw_ostream &OS, uint64_t N = UINT64_MAX) const; /// Write the contents (regardless of whether it is binary or a /// hex string) as hex to the given raw_ostream. /// /// For example, a possible output could be `DEADBEEFCAFEBABE`. 
- void writeAsHex(raw_ostream &OS) const; + LLVM_ABI void writeAsHex(raw_ostream &OS) const; }; inline bool operator==(const BinaryRef &LHS, const BinaryRef &RHS) { @@ -104,8 +105,8 @@ inline bool operator==(const BinaryRef &LHS, const BinaryRef &RHS) { } template <> struct ScalarTraits { - static void output(const BinaryRef &, void *, raw_ostream &); - static StringRef input(StringRef, void *, BinaryRef &); + LLVM_ABI static void output(const BinaryRef &, void *, raw_ostream &); + LLVM_ABI static StringRef input(StringRef, void *, BinaryRef &); static QuotingType mustQuote(StringRef S) { return needsQuotes(S); } }; diff --git a/llvm/include/llvm/ObjectYAML/yaml2obj.h b/llvm/include/llvm/ObjectYAML/yaml2obj.h index 3b458c3cd890b..4c9084b790507 100644 --- a/llvm/include/llvm/ObjectYAML/yaml2obj.h +++ b/llvm/include/llvm/ObjectYAML/yaml2obj.h @@ -12,6 +12,7 @@ #define LLVM_OBJECTYAML_YAML2OBJ_H #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -66,25 +67,32 @@ struct YamlObjectFile; using ErrorHandler = llvm::function_ref; -bool yaml2archive(ArchYAML::Archive &Doc, raw_ostream &Out, ErrorHandler EH); -bool yaml2coff(COFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH); -bool yaml2goff(GOFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH); -bool yaml2elf(ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH, - uint64_t MaxSize); -bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, ErrorHandler EH); -bool yaml2minidump(MinidumpYAML::Object &Doc, raw_ostream &Out, - ErrorHandler EH); -bool yaml2offload(OffloadYAML::Binary &Doc, raw_ostream &Out, ErrorHandler EH); -bool yaml2wasm(WasmYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH); -bool yaml2xcoff(XCOFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH); -bool yaml2dxcontainer(DXContainerYAML::Object &Doc, raw_ostream &Out, - ErrorHandler EH); - -bool convertYAML(Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler, - unsigned DocNum = 1, uint64_t 
MaxSize = UINT64_MAX); +LLVM_ABI bool yaml2archive(ArchYAML::Archive &Doc, raw_ostream &Out, + ErrorHandler EH); +LLVM_ABI bool yaml2coff(COFFYAML::Object &Doc, raw_ostream &Out, + ErrorHandler EH); +LLVM_ABI bool yaml2goff(GOFFYAML::Object &Doc, raw_ostream &Out, + ErrorHandler EH); +LLVM_ABI bool yaml2elf(ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH, + uint64_t MaxSize); +LLVM_ABI bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, + ErrorHandler EH); +LLVM_ABI bool yaml2minidump(MinidumpYAML::Object &Doc, raw_ostream &Out, + ErrorHandler EH); +LLVM_ABI bool yaml2offload(OffloadYAML::Binary &Doc, raw_ostream &Out, + ErrorHandler EH); +LLVM_ABI bool yaml2wasm(WasmYAML::Object &Doc, raw_ostream &Out, + ErrorHandler EH); +LLVM_ABI bool yaml2xcoff(XCOFFYAML::Object &Doc, raw_ostream &Out, + ErrorHandler EH); +LLVM_ABI bool yaml2dxcontainer(DXContainerYAML::Object &Doc, raw_ostream &Out, + ErrorHandler EH); + +LLVM_ABI bool convertYAML(Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler, + unsigned DocNum = 1, uint64_t MaxSize = UINT64_MAX); /// Convenience function for tests. 
-std::unique_ptr +LLVM_ABI std::unique_ptr yaml2ObjectFile(SmallVectorImpl &Storage, StringRef Yaml, ErrorHandler ErrHandler); diff --git a/llvm/include/llvm/Option/OptSpecifier.h b/llvm/include/llvm/Option/OptSpecifier.h index dc6acae7fc002..cb87fbd17ec1c 100644 --- a/llvm/include/llvm/Option/OptSpecifier.h +++ b/llvm/include/llvm/Option/OptSpecifier.h @@ -9,6 +9,8 @@ #ifndef LLVM_OPTION_OPTSPECIFIER_H #define LLVM_OPTION_OPTSPECIFIER_H +#include "llvm/Support/Compiler.h" + namespace llvm { namespace opt { diff --git a/llvm/include/llvm/Pass.h b/llvm/include/llvm/Pass.h index 921db0b5f7aec..58c45e75b3f0a 100644 --- a/llvm/include/llvm/Pass.h +++ b/llvm/include/llvm/Pass.h @@ -31,6 +31,7 @@ #ifdef EXPENSIVE_CHECKS #include #endif +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -95,7 +96,7 @@ const char *to_string(ThinOrFullLTOPhase Phase); /// interprocedural optimization or you do not fit into any of the more /// constrained passes described below. /// -class Pass { +class LLVM_ABI Pass { AnalysisResolver *Resolver = nullptr; // Used to resolve analysis const void *PassID; PassKind Kind; @@ -252,7 +253,7 @@ class Pass { /// interprocedural optimizations and analyses. ModulePasses may do anything /// they want to the program. /// -class ModulePass : public Pass { +class LLVM_ABI ModulePass : public Pass { public: explicit ModulePass(char &pid) : Pass(PT_Module, pid) {} @@ -282,7 +283,7 @@ class ModulePass : public Pass { /// ImmutablePass class - This class is used to provide information that does /// not need to be run. This is useful for things like target information. /// -class ImmutablePass : public ModulePass { +class LLVM_ABI ImmutablePass : public ModulePass { public: explicit ImmutablePass(char &pid) : ModulePass(pid) {} @@ -311,7 +312,7 @@ class ImmutablePass : public ModulePass { /// 2. 
Optimizing a function does not cause the addition or removal of any /// functions in the module /// -class FunctionPass : public Pass { +class LLVM_ABI FunctionPass : public Pass { public: explicit FunctionPass(char &pid) : Pass(PT_Function, pid) {} @@ -338,13 +339,13 @@ class FunctionPass : public Pass { /// If the user specifies the -time-passes argument on an LLVM tool command line /// then the value of this boolean will be true, otherwise false. /// This is the storage for the -time-passes option. -extern bool TimePassesIsEnabled; +LLVM_ABI extern bool TimePassesIsEnabled; /// If TimePassesPerRun is true, there would be one line of report for /// each pass invocation. /// If TimePassesPerRun is false, there would be only one line of /// report for each pass (even there are more than one pass objects). /// (For new pass manager only) -extern bool TimePassesPerRun; +LLVM_ABI extern bool TimePassesPerRun; } // end namespace llvm diff --git a/llvm/include/llvm/PassAnalysisSupport.h b/llvm/include/llvm/PassAnalysisSupport.h index 4bed3cb55a901..02abb00b66b52 100644 --- a/llvm/include/llvm/PassAnalysisSupport.h +++ b/llvm/include/llvm/PassAnalysisSupport.h @@ -24,6 +24,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Compiler.h" #include #include #include @@ -69,14 +70,14 @@ class AnalysisUsage { ///@{ /// Add the specified ID to the required set of the usage info for a pass. - AnalysisUsage &addRequiredID(const void *ID); - AnalysisUsage &addRequiredID(char &ID); + LLVM_ABI AnalysisUsage &addRequiredID(const void *ID); + LLVM_ABI AnalysisUsage &addRequiredID(char &ID); template AnalysisUsage &addRequired() { return addRequiredID(PassClass::ID); } - AnalysisUsage &addRequiredTransitiveID(char &ID); + LLVM_ABI AnalysisUsage &addRequiredTransitiveID(char &ID); template AnalysisUsage &addRequiredTransitive() { return addRequiredTransitiveID(PassClass::ID); @@ -124,7 +125,7 @@ class AnalysisUsage { /// preserved by this pass. 
If no such Pass exists, do nothing. This can be /// useful when a pass is trivially preserved, but may not be linked in. Be /// careful about spelling! - AnalysisUsage &addPreserved(StringRef Arg); + LLVM_ABI AnalysisUsage &addPreserved(StringRef Arg); /// Set by analyses that do not transform their input at all void setPreservesAll() { PreservesAll = true; } @@ -139,7 +140,7 @@ class AnalysisUsage { /// /// This function annotates the AnalysisUsage info object to say that analyses /// that only depend on the CFG are preserved by this pass. - void setPreservesCFG(); + LLVM_ABI void setPreservesCFG(); const VectorType &getRequiredSet() const { return Required; } const VectorType &getRequiredTransitiveSet() const { @@ -174,7 +175,8 @@ class AnalysisResolver { } /// Find pass that is implementing PI. Initialize pass for Function F. - std::tuple findImplPass(Pass *P, AnalysisID PI, Function &F); + LLVM_ABI std::tuple findImplPass(Pass *P, AnalysisID PI, + Function &F); void addAnalysisImplsPair(AnalysisID PI, Pass *P) { if (findImplPass(PI) == P) @@ -189,7 +191,7 @@ class AnalysisResolver { } /// Return analysis result or null if it doesn't exist. 
- Pass *getAnalysisIfAvailable(AnalysisID ID) const; + LLVM_ABI Pass *getAnalysisIfAvailable(AnalysisID ID) const; private: /// This keeps track of which passes implements the interfaces that are diff --git a/llvm/include/llvm/PassRegistry.h b/llvm/include/llvm/PassRegistry.h index 003c0ac4c374b..f3dada0c0ba6c 100644 --- a/llvm/include/llvm/PassRegistry.h +++ b/llvm/include/llvm/PassRegistry.h @@ -19,6 +19,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/RWMutex.h" #include #include @@ -49,36 +50,36 @@ class PassRegistry { public: PassRegistry() = default; - ~PassRegistry(); + LLVM_ABI ~PassRegistry(); /// getPassRegistry - Access the global registry object, which is /// automatically initialized at application launch and destroyed by /// llvm_shutdown. - static PassRegistry *getPassRegistry(); + LLVM_ABI static PassRegistry *getPassRegistry(); /// getPassInfo - Look up a pass' corresponding PassInfo, indexed by the pass' /// type identifier (&MyPass::ID). - const PassInfo *getPassInfo(const void *TI) const; + LLVM_ABI const PassInfo *getPassInfo(const void *TI) const; /// getPassInfo - Look up a pass' corresponding PassInfo, indexed by the pass' /// argument string. - const PassInfo *getPassInfo(StringRef Arg) const; + LLVM_ABI const PassInfo *getPassInfo(StringRef Arg) const; /// registerPass - Register a pass (by means of its PassInfo) with the /// registry. Required in order to use the pass with a PassManager. - void registerPass(const PassInfo &PI, bool ShouldFree = false); + LLVM_ABI void registerPass(const PassInfo &PI, bool ShouldFree = false); /// enumerateWith - Enumerate the registered passes, calling the provided /// PassRegistrationListener's passEnumerate() callback on each of them. 
- void enumerateWith(PassRegistrationListener *L); + LLVM_ABI void enumerateWith(PassRegistrationListener *L); /// addRegistrationListener - Register the given PassRegistrationListener /// to receive passRegistered() callbacks whenever a new pass is registered. - void addRegistrationListener(PassRegistrationListener *L); + LLVM_ABI void addRegistrationListener(PassRegistrationListener *L); /// removeRegistrationListener - Unregister a PassRegistrationListener so that /// it no longer receives passRegistered() callbacks. - void removeRegistrationListener(PassRegistrationListener *L); + LLVM_ABI void removeRegistrationListener(PassRegistrationListener *L); }; } // end namespace llvm diff --git a/llvm/include/llvm/PassSupport.h b/llvm/include/llvm/PassSupport.h index b0897a6be37d1..7f0306e33e832 100644 --- a/llvm/include/llvm/PassSupport.h +++ b/llvm/include/llvm/PassSupport.h @@ -27,6 +27,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/PassInfo.h" #include "llvm/PassRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/Threading.h" #include @@ -112,7 +113,7 @@ struct PassRegistrationListener { /// enumeratePasses - Iterate over the registered passes, calling the /// passEnumerate callback on each PassInfo object. - void enumeratePasses(); + LLVM_ABI void enumeratePasses(); /// passEnumerate - Callback function invoked when someone calls /// enumeratePasses on this PassRegistrationListener object. diff --git a/llvm/include/llvm/Passes/OptimizationLevel.h b/llvm/include/llvm/Passes/OptimizationLevel.h index d2c3fde4935fb..1cf258f1ffd0d 100644 --- a/llvm/include/llvm/Passes/OptimizationLevel.h +++ b/llvm/include/llvm/Passes/OptimizationLevel.h @@ -15,6 +15,7 @@ #ifndef LLVM_PASSES_OPTIMIZATIONLEVEL_H #define LLVM_PASSES_OPTIMIZATIONLEVEL_H +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -38,7 +39,7 @@ class OptimizationLevel final { /// Disable as many optimizations as possible. 
This doesn't completely /// disable the optimizer in all cases, for example always_inline functions /// can be required to be inlined for correctness. - static const OptimizationLevel O0; + LLVM_ABI static const OptimizationLevel O0; /// Optimize quickly without destroying debuggability. /// @@ -54,7 +55,7 @@ class OptimizationLevel final { /// vectorization, or fusion don't make sense here due to the degree to /// which the executed code differs from the source code, and the compile /// time cost. - static const OptimizationLevel O1; + LLVM_ABI static const OptimizationLevel O1; /// Optimize for fast execution as much as possible without triggering /// significant incremental compile time or code size growth. /// @@ -71,7 +72,7 @@ class OptimizationLevel final { /// /// This is expected to be a good default optimization level for the vast /// majority of users. - static const OptimizationLevel O2; + LLVM_ABI static const OptimizationLevel O2; /// Optimize for fast execution as much as possible. /// /// This mode is significantly more aggressive in trading off compile time @@ -86,7 +87,7 @@ class OptimizationLevel final { /// order to make even significantly slower compile times at least scale /// reasonably. This does not preclude very substantial constant factor /// costs though. - static const OptimizationLevel O3; + LLVM_ABI static const OptimizationLevel O3; /// Similar to \c O2 but tries to optimize for small code size instead of /// fast execution without triggering significant incremental execution /// time slowdowns. @@ -97,7 +98,7 @@ class OptimizationLevel final { /// A consequence of the different core goal is that this should in general /// produce substantially smaller executables that still run in /// a reasonable amount of time. - static const OptimizationLevel Os; + LLVM_ABI static const OptimizationLevel Os; /// A very specialized mode that will optimize for code size at any and all /// costs. 
/// @@ -105,7 +106,7 @@ class OptimizationLevel final { /// any effort taken to reduce the size is worth it regardless of the /// execution time impact. You should expect this level to produce rather /// slow, but very small, code. - static const OptimizationLevel Oz; + LLVM_ABI static const OptimizationLevel Oz; bool isOptimizingForSpeed() const { return SizeLevel == 0 && SpeedLevel > 0; } diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 51ccaa53447d7..f13b5c678a894 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -20,6 +20,7 @@ #include "llvm/CodeGen/RegAllocCommon.h" #include "llvm/IR/PassManager.h" #include "llvm/Passes/OptimizationLevel.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/PGOOptions.h" #include "llvm/Support/raw_ostream.h" @@ -44,7 +45,7 @@ class PipelineTuningOptions { public: /// Constructor sets pipeline tuning defaults based on cl::opts. Each option /// can be set in the PassBuilder when using a LLVM as a library. - PipelineTuningOptions(); + LLVM_ABI PipelineTuningOptions(); /// Tuning option to set loop interleaving on/off, set based on opt level. bool LoopInterleaving; @@ -126,20 +127,20 @@ class PassBuilder { std::vector InnerPipeline; }; - explicit PassBuilder(TargetMachine *TM = nullptr, - PipelineTuningOptions PTO = PipelineTuningOptions(), - std::optional PGOOpt = std::nullopt, - PassInstrumentationCallbacks *PIC = nullptr); + LLVM_ABI explicit PassBuilder( + TargetMachine *TM = nullptr, + PipelineTuningOptions PTO = PipelineTuningOptions(), + std::optional PGOOpt = std::nullopt, + PassInstrumentationCallbacks *PIC = nullptr); /// Cross register the analysis managers through their proxies. /// /// This is an interface that can be used to cross register each /// AnalysisManager with all the others analysis managers. 
- void crossRegisterProxies(LoopAnalysisManager &LAM, - FunctionAnalysisManager &FAM, - CGSCCAnalysisManager &CGAM, - ModuleAnalysisManager &MAM, - MachineFunctionAnalysisManager *MFAM = nullptr); + LLVM_ABI void + crossRegisterProxies(LoopAnalysisManager &LAM, FunctionAnalysisManager &FAM, + CGSCCAnalysisManager &CGAM, ModuleAnalysisManager &MAM, + MachineFunctionAnalysisManager *MFAM = nullptr); /// Registers all available module analysis passes. /// @@ -147,7 +148,7 @@ class PassBuilder { /// ModuleAnalysisManager with all registered module analyses. Callers can /// still manually register any additional analyses. Callers can also /// pre-register analyses and this will not override those. - void registerModuleAnalyses(ModuleAnalysisManager &MAM); + LLVM_ABI void registerModuleAnalyses(ModuleAnalysisManager &MAM); /// Registers all available CGSCC analysis passes. /// @@ -155,7 +156,7 @@ class PassBuilder { /// with all registered CGSCC analyses. Callers can still manually register any /// additional analyses. Callers can also pre-register analyses and this will /// not override those. - void registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM); + LLVM_ABI void registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM); /// Registers all available function analysis passes. /// @@ -163,14 +164,14 @@ class PassBuilder { /// FunctionAnalysisManager with all registered function analyses. Callers can /// still manually register any additional analyses. Callers can also /// pre-register analyses and this will not override those. - void registerFunctionAnalyses(FunctionAnalysisManager &FAM); + LLVM_ABI void registerFunctionAnalyses(FunctionAnalysisManager &FAM); /// Registers all available loop analysis passes. /// /// This is an interface that can be used to populate a \c LoopAnalysisManager /// with all registered loop analyses. Callers can still manually register any /// additional analyses. 
- void registerLoopAnalyses(LoopAnalysisManager &LAM); + LLVM_ABI void registerLoopAnalyses(LoopAnalysisManager &LAM); /// Registers all available machine function analysis passes. /// @@ -178,7 +179,8 @@ class PassBuilder { /// MachineFunctionAnalysisManager with all registered function analyses. /// Callers can still manually register any additional analyses. Callers can /// also pre-register analyses and this will not override those. - void registerMachineFunctionAnalyses(MachineFunctionAnalysisManager &MFAM); + LLVM_ABI void + registerMachineFunctionAnalyses(MachineFunctionAnalysisManager &MFAM); /// Construct the core LLVM function canonicalization and simplification /// pipeline. @@ -194,9 +196,8 @@ class PassBuilder { /// build them. /// /// \p Phase indicates the current ThinLTO phase. - FunctionPassManager - buildFunctionSimplificationPipeline(OptimizationLevel Level, - ThinOrFullLTOPhase Phase); + LLVM_ABI FunctionPassManager buildFunctionSimplificationPipeline( + OptimizationLevel Level, ThinOrFullLTOPhase Phase); /// Construct the core LLVM module canonicalization and simplification /// pipeline. @@ -213,18 +214,18 @@ class PassBuilder { /// build them. /// /// \p Phase indicates the current ThinLTO phase. - ModulePassManager buildModuleSimplificationPipeline(OptimizationLevel Level, - ThinOrFullLTOPhase Phase); + LLVM_ABI ModulePassManager buildModuleSimplificationPipeline( + OptimizationLevel Level, ThinOrFullLTOPhase Phase); /// Construct the module pipeline that performs inlining as well as /// the inlining-driven cleanups. - ModuleInlinerWrapperPass buildInlinerPipeline(OptimizationLevel Level, - ThinOrFullLTOPhase Phase); + LLVM_ABI ModuleInlinerWrapperPass + buildInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase); /// Construct the module pipeline that performs inlining with /// module inliner pass. 
- ModulePassManager buildModuleInlinerPipeline(OptimizationLevel Level, - ThinOrFullLTOPhase Phase); + LLVM_ABI ModulePassManager + buildModuleInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase); /// Construct the core LLVM module optimization pipeline. /// @@ -239,9 +240,8 @@ class PassBuilder { /// only intended for use when attempting to optimize code. If frontends /// require some transformations for semantic reasons, they should explicitly /// build them. - ModulePassManager - buildModuleOptimizationPipeline(OptimizationLevel Level, - ThinOrFullLTOPhase LTOPhase); + LLVM_ABI ModulePassManager buildModuleOptimizationPipeline( + OptimizationLevel Level, ThinOrFullLTOPhase LTOPhase); /// Build a per-module default optimization pipeline. /// @@ -249,7 +249,7 @@ class PassBuilder { /// optimization and code generation without any link-time optimization. It /// typically correspond to frontend "-O[123]" options for optimization /// levels \c O1, \c O2 and \c O3 resp. - ModulePassManager buildPerModuleDefaultPipeline( + LLVM_ABI ModulePassManager buildPerModuleDefaultPipeline( OptimizationLevel Level, ThinOrFullLTOPhase Phase = ThinOrFullLTOPhase::None); @@ -258,8 +258,9 @@ class PassBuilder { /// This builds a pipeline that runs the LTO/ThinLTO pre-link pipeline, and /// emits a section containing the pre-link bitcode along side the object code /// generated in non-LTO compilation. - ModulePassManager buildFatLTODefaultPipeline(OptimizationLevel Level, - bool ThinLTO, bool EmitSummary); + LLVM_ABI ModulePassManager buildFatLTODefaultPipeline(OptimizationLevel Level, + bool ThinLTO, + bool EmitSummary); /// Build a pre-link, ThinLTO-targeting default optimization pipeline to /// a pass manager. @@ -268,7 +269,8 @@ class PassBuilder { /// a ThinLTO run. It works to minimize the IR which needs to be analyzed /// without making irreversible decisions which could be made better during /// the LTO run. 
- ModulePassManager buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level); + LLVM_ABI ModulePassManager + buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level); /// Build a ThinLTO default optimization pipeline to a pass manager. /// @@ -276,9 +278,8 @@ class PassBuilder { /// optimization and code generation. It is particularly tuned to fit well /// when IR coming into the LTO phase was first run through \c /// buildThinLTOPreLinkDefaultPipeline, and the two coordinate closely. - ModulePassManager - buildThinLTODefaultPipeline(OptimizationLevel Level, - const ModuleSummaryIndex *ImportSummary); + LLVM_ABI ModulePassManager buildThinLTODefaultPipeline( + OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary); /// Build a pre-link, LTO-targeting default optimization pipeline to a pass /// manager. @@ -287,7 +288,8 @@ class PassBuilder { /// run. It works to minimize the IR which needs to be analyzed without /// making irreversible decisions which could be made better during the LTO /// run. - ModulePassManager buildLTOPreLinkDefaultPipeline(OptimizationLevel Level); + LLVM_ABI ModulePassManager + buildLTOPreLinkDefaultPipeline(OptimizationLevel Level); /// Build an LTO default optimization pipeline to a pass manager. /// @@ -295,13 +297,13 @@ class PassBuilder { /// optimization and code generation. It is particularly tuned to fit well /// when IR coming into the LTO phase was first run through \c /// buildLTOPreLinkDefaultPipeline, and the two coordinate closely. - ModulePassManager buildLTODefaultPipeline(OptimizationLevel Level, - ModuleSummaryIndex *ExportSummary); + LLVM_ABI ModulePassManager buildLTODefaultPipeline( + OptimizationLevel Level, ModuleSummaryIndex *ExportSummary); /// Build an O0 pipeline with the minimal semantically required passes. /// /// This should only be used for non-LTO and LTO pre-link pipelines. 
- ModulePassManager + LLVM_ABI ModulePassManager buildO0DefaultPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase = ThinOrFullLTOPhase::None); @@ -310,7 +312,7 @@ class PassBuilder { /// /// This also adds target-specific alias analyses registered via /// TargetMachine::registerDefaultAliasAnalyses(). - AAManager buildDefaultAAPipeline(); + LLVM_ABI AAManager buildDefaultAAPipeline(); /// Parse a textual pass pipeline description into a \c /// ModulePassManager. @@ -352,7 +354,8 @@ class PassBuilder { /// specifically want the pass to run under a adaptor directly. This is /// preferred when a pipeline is largely of one type, but one or just a few /// passes are of different types(See PassBuilder.cpp for examples). - Error parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText); + LLVM_ABI Error parsePassPipeline(ModulePassManager &MPM, + StringRef PipelineText); /// {{@ Parse a textual pass pipeline description into a specific PassManager /// @@ -361,9 +364,12 @@ class PassBuilder { /// this is the valid pipeline text: /// /// function(lpass) - Error parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText); - Error parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText); - Error parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText); + LLVM_ABI Error parsePassPipeline(CGSCCPassManager &CGPM, + StringRef PipelineText); + LLVM_ABI Error parsePassPipeline(FunctionPassManager &FPM, + StringRef PipelineText); + LLVM_ABI Error parsePassPipeline(LoopPassManager &LPM, + StringRef PipelineText); /// @}} /// Parse a textual MIR pipeline into the provided \c MachineFunctionPass @@ -375,8 +381,8 @@ class PassBuilder { /// /// There is no need to specify the pass nesting, and this function /// currently cannot handle the pass nesting. 
- Error parsePassPipeline(MachineFunctionPassManager &MFPM, - StringRef PipelineText); + LLVM_ABI Error parsePassPipeline(MachineFunctionPassManager &MFPM, + StringRef PipelineText); /// Parse a textual alias analysis pipeline into the provided AA manager. /// @@ -393,14 +399,14 @@ class PassBuilder { /// Returns false if the text cannot be parsed cleanly. The specific state of /// the \p AA manager is unspecified if such an error is encountered and this /// returns false. - Error parseAAPipeline(AAManager &AA, StringRef PipelineText); + LLVM_ABI Error parseAAPipeline(AAManager &AA, StringRef PipelineText); /// Parse RegAllocFilterName to get RegAllocFilterFunc. - std::optional + LLVM_ABI std::optional parseRegAllocFilter(StringRef RegAllocFilterName); /// Print pass names. - void printPassNames(raw_ostream &OS); + LLVM_ABI void printPassNames(raw_ostream &OS); /// Register a callback for a default optimizer pipeline extension /// point @@ -614,16 +620,17 @@ class PassBuilder { /// If the PassManager type is not given at the top level of the pipeline /// text, this Callback should be used to determine the appropriate stack of /// PassManagers and populate the passed ModulePassManager. - void registerParseTopLevelPipelineCallback( + LLVM_ABI void registerParseTopLevelPipelineCallback( const std::function)> &C); /// Add PGOInstrumenation passes for O0 only. - void addPGOInstrPassesForO0(ModulePassManager &MPM, bool RunProfileGen, - bool IsCS, bool AtomicCounterUpdate, - std::string ProfileFile, - std::string ProfileRemappingFile, - IntrusiveRefCntPtr FS); + LLVM_ABI void addPGOInstrPassesForO0(ModulePassManager &MPM, + bool RunProfileGen, bool IsCS, + bool AtomicCounterUpdate, + std::string ProfileFile, + std::string ProfileRemappingFile, + IntrusiveRefCntPtr FS); /// Returns PIC. External libraries can use this to register pass /// instrumentation callbacks. 
@@ -634,35 +641,38 @@ class PassBuilder { // Invoke the callbacks registered for the various extension points. // Custom pipelines should use these to invoke the callbacks registered // by TargetMachines and other clients. - void invokePeepholeEPCallbacks(FunctionPassManager &FPM, - OptimizationLevel Level); - void invokeLateLoopOptimizationsEPCallbacks(LoopPassManager &LPM, - OptimizationLevel Level); - void invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM, - OptimizationLevel Level); - void invokeScalarOptimizerLateEPCallbacks(FunctionPassManager &FPM, - OptimizationLevel Level); - void invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM, - OptimizationLevel Level); - void invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM, - OptimizationLevel Level); - void invokeVectorizerEndEPCallbacks(FunctionPassManager &FPM, - OptimizationLevel Level); - void invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM, - OptimizationLevel Level, - ThinOrFullLTOPhase Phase); - void invokeOptimizerLastEPCallbacks(ModulePassManager &MPM, - OptimizationLevel Level, - ThinOrFullLTOPhase Phase); - void invokeFullLinkTimeOptimizationEarlyEPCallbacks(ModulePassManager &MPM, - OptimizationLevel Level); - void invokeFullLinkTimeOptimizationLastEPCallbacks(ModulePassManager &MPM, + LLVM_ABI void invokePeepholeEPCallbacks(FunctionPassManager &FPM, + OptimizationLevel Level); + LLVM_ABI void invokeLateLoopOptimizationsEPCallbacks(LoopPassManager &LPM, + OptimizationLevel Level); + LLVM_ABI void invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM, + OptimizationLevel Level); + LLVM_ABI void invokeScalarOptimizerLateEPCallbacks(FunctionPassManager &FPM, OptimizationLevel Level); - void invokePipelineStartEPCallbacks(ModulePassManager &MPM, - OptimizationLevel Level); - void invokePipelineEarlySimplificationEPCallbacks(ModulePassManager &MPM, - OptimizationLevel Level, - ThinOrFullLTOPhase Phase); + LLVM_ABI void invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM, 
+ OptimizationLevel Level); + LLVM_ABI void invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM, + OptimizationLevel Level); + LLVM_ABI void invokeVectorizerEndEPCallbacks(FunctionPassManager &FPM, + OptimizationLevel Level); + LLVM_ABI void invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase Phase); + LLVM_ABI void invokeOptimizerLastEPCallbacks(ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase Phase); + LLVM_ABI void + invokeFullLinkTimeOptimizationEarlyEPCallbacks(ModulePassManager &MPM, + OptimizationLevel Level); + LLVM_ABI void + invokeFullLinkTimeOptimizationLastEPCallbacks(ModulePassManager &MPM, + OptimizationLevel Level); + LLVM_ABI void invokePipelineStartEPCallbacks(ModulePassManager &MPM, + OptimizationLevel Level); + LLVM_ABI void + invokePipelineEarlySimplificationEPCallbacks(ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase Phase); static bool checkParametrizedPassName(StringRef Name, StringRef PassName) { if (!Name.consume_front(PassName)) @@ -713,9 +723,9 @@ class PassBuilder { /// Handle passes only accept one bool-valued parameter. /// /// \return false when Params is empty. - static Expected parseSinglePassOption(StringRef Params, - StringRef OptionName, - StringRef PassName); + LLVM_ABI static Expected parseSinglePassOption(StringRef Params, + StringRef OptionName, + StringRef PassName); private: // O1 pass pipeline @@ -898,7 +908,7 @@ struct NoOpModulePass : PassInfoMixin { /// No-op module analysis. class NoOpModuleAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; - static AnalysisKey Key; + LLVM_ABI static AnalysisKey Key; public: struct Result {}; @@ -916,7 +926,7 @@ struct NoOpCGSCCPass : PassInfoMixin { /// No-op CGSCC analysis. 
class NoOpCGSCCAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; - static AnalysisKey Key; + LLVM_ABI static AnalysisKey Key; public: struct Result {}; @@ -935,7 +945,7 @@ struct NoOpFunctionPass : PassInfoMixin { /// No-op function analysis. class NoOpFunctionAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; - static AnalysisKey Key; + LLVM_ABI static AnalysisKey Key; public: struct Result {}; @@ -968,7 +978,7 @@ struct NoOpMachineFunctionPass : public PassInfoMixin { /// No-op loop analysis. class NoOpLoopAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; - static AnalysisKey Key; + LLVM_ABI static AnalysisKey Key; public: struct Result {}; @@ -978,8 +988,7 @@ class NoOpLoopAnalysis : public AnalysisInfoMixin { }; /// Common option used by multiple tools to print pipeline passes -extern cl::opt PrintPipelinePasses; - +LLVM_ABI extern cl::opt PrintPipelinePasses; } #endif diff --git a/llvm/include/llvm/Passes/PassPlugin.h b/llvm/include/llvm/Passes/PassPlugin.h index 013b7a827c47d..947504bc207a7 100644 --- a/llvm/include/llvm/Passes/PassPlugin.h +++ b/llvm/include/llvm/Passes/PassPlugin.h @@ -64,7 +64,7 @@ class PassPlugin { /// \returns Returns an error if either the library cannot be found or loaded, /// there is no public entry point, or the plugin implements the wrong API /// version. - static Expected Load(const std::string &Filename); + LLVM_ABI static Expected Load(const std::string &Filename); /// Get the filename of the loaded plugin. 
StringRef getFilename() const { return Filename; } diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h index f7a65a88ecf5b..4ee5ab2554868 100644 --- a/llvm/include/llvm/Passes/StandardInstrumentations.h +++ b/llvm/include/llvm/Passes/StandardInstrumentations.h @@ -27,6 +27,7 @@ #include "llvm/IR/PassTimingInfo.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Transforms/IPO/SampleProfileProbe.h" @@ -46,9 +47,9 @@ class PassInstrumentationCallbacks; /// (typically Loop or SCC). class PrintIRInstrumentation { public: - ~PrintIRInstrumentation(); + LLVM_ABI ~PrintIRInstrumentation(); - void registerCallbacks(PassInstrumentationCallbacks &PIC); + LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC); private: struct PassRunDescriptor { @@ -104,7 +105,7 @@ class PrintIRInstrumentation { class OptNoneInstrumentation { public: OptNoneInstrumentation(bool DebugLogging) : DebugLogging(DebugLogging) {} - void registerCallbacks(PassInstrumentationCallbacks &PIC); + LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC); private: bool DebugLogging; @@ -116,8 +117,8 @@ class OptPassGateInstrumentation { bool HasWrittenIR = false; public: OptPassGateInstrumentation(LLVMContext &Context) : Context(Context) {} - bool shouldRun(StringRef PassName, Any IR); - void registerCallbacks(PassInstrumentationCallbacks &PIC); + LLVM_ABI bool shouldRun(StringRef PassName, Any IR); + LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC); }; struct PrintPassOptions { @@ -136,7 +137,7 @@ class PrintPassInstrumentation { public: PrintPassInstrumentation(bool Enabled, PrintPassOptions Opts) : Enabled(Enabled), Opts(Opts) {} - void registerCallbacks(PassInstrumentationCallbacks &PIC); + LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC); private: bool Enabled; @@ 
-167,7 +168,7 @@ class PreservedCFGCheckerInstrumentation { std::optional> BBGuards; DenseMap> Graph; - CFG(const Function *F, bool TrackBBLifetime); + LLVM_ABI CFG(const Function *F, bool TrackBBLifetime); bool operator==(const CFG &G) const { return !isPoisoned() && !G.isPoisoned() && Graph == G.Graph; @@ -179,18 +180,18 @@ class PreservedCFGCheckerInstrumentation { }); } - static void printDiff(raw_ostream &out, const CFG &Before, - const CFG &After); - bool invalidate(Function &F, const PreservedAnalyses &PA, - FunctionAnalysisManager::Invalidator &); + LLVM_ABI static void printDiff(raw_ostream &out, const CFG &Before, + const CFG &After); + LLVM_ABI bool invalidate(Function &F, const PreservedAnalyses &PA, + FunctionAnalysisManager::Invalidator &); }; #if LLVM_ENABLE_ABI_BREAKING_CHECKS SmallVector PassStack; #endif - void registerCallbacks(PassInstrumentationCallbacks &PIC, - ModuleAnalysisManager &MAM); + LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC, + ModuleAnalysisManager &MAM); }; // Base class for classes that report changes to the IR. @@ -208,7 +209,7 @@ class PreservedCFGCheckerInstrumentation { // 6. When a pass is run on an IR that is not interesting (based on options). // 7. When a pass is ignored (pass manager or adapter pass). // 8. To compare two IR representations (of type \p T). -template class ChangeReporter { +template class LLVM_ABI ChangeReporter { protected: ChangeReporter(bool RunInVerboseMode) : VerboseMode(RunInVerboseMode) {} @@ -257,7 +258,7 @@ template class ChangeReporter { // An abstract template base class that handles printing banners and // reporting when things have not changed or are filtered out. template -class TextChangeReporter : public ChangeReporter { +class LLVM_ABI TextChangeReporter : public ChangeReporter { protected: TextChangeReporter(bool Verbose); @@ -281,7 +282,7 @@ class TextChangeReporter : public ChangeReporter { // by unwrapAndPrint. 
The string representation is stored in a std::string // to preserve it as the IR changes in each pass. Note that the banner is // included in this representation but it is massaged before reporting. -class IRChangedPrinter : public TextChangeReporter { +class LLVM_ABI IRChangedPrinter : public TextChangeReporter { public: IRChangedPrinter(bool VerboseMode) : TextChangeReporter(VerboseMode) {} @@ -298,7 +299,7 @@ class IRChangedPrinter : public TextChangeReporter { Any) override; }; -class IRChangedTester : public IRChangedPrinter { +class LLVM_ABI IRChangedTester : public IRChangedPrinter { public: IRChangedTester() : IRChangedPrinter(true) {} ~IRChangedTester() override; @@ -444,7 +445,8 @@ template class IRComparer { // and added, respectively. Changes to the IR that do not affect basic // blocks are not reported as having changed the IR. The option // -print-module-scope does not affect this change reporter. -class InLineChangePrinter : public TextChangeReporter> { +class LLVM_ABI InLineChangePrinter + : public TextChangeReporter> { public: InLineChangePrinter(bool VerboseMode, bool ColourMode) : TextChangeReporter>(VerboseMode), @@ -475,8 +477,8 @@ class VerifyInstrumentation { public: VerifyInstrumentation(bool DebugLogging) : DebugLogging(DebugLogging) {} - void registerCallbacks(PassInstrumentationCallbacks &PIC, - ModuleAnalysisManager *MAM); + LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC, + ModuleAnalysisManager *MAM); }; /// This class implements --time-trace functionality for new pass manager. @@ -484,12 +486,12 @@ class VerifyInstrumentation { /// execution time. They collect time tracing info by TimeProfiler. class TimeProfilingPassesHandler { public: - TimeProfilingPassesHandler(); + LLVM_ABI TimeProfilingPassesHandler(); // We intend this to be unique per-compilation, thus no copies. 
TimeProfilingPassesHandler(const TimeProfilingPassesHandler &) = delete; void operator=(const TimeProfilingPassesHandler &) = delete; - void registerCallbacks(PassInstrumentationCallbacks &PIC); + LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC); private: // Implementation of pass instrumentation callbacks. @@ -502,8 +504,8 @@ class TimeProfilingPassesHandler { class DCData { public: // Fill the map with the transitions from basic block \p B. - DCData(const BasicBlock &B); - DCData(const MachineBasicBlock &B); + LLVM_ABI DCData(const BasicBlock &B); + LLVM_ABI DCData(const MachineBasicBlock &B); // Return an iterator to the names of the successor blocks. StringMap::const_iterator begin() const { @@ -531,7 +533,7 @@ class DCData { // A change reporter that builds a website with links to pdf files showing // dot control flow graphs with changed instructions shown in colour. -class DotCfgChangeReporter : public ChangeReporter> { +class LLVM_ABI DotCfgChangeReporter : public ChangeReporter> { public: DotCfgChangeReporter(bool Verbose); ~DotCfgChangeReporter() override; @@ -578,9 +580,9 @@ class PrintCrashIRInstrumentation { public: PrintCrashIRInstrumentation() : SavedIR("*** Dump of IR Before Last Pass Unknown ***") {} - ~PrintCrashIRInstrumentation(); - void registerCallbacks(PassInstrumentationCallbacks &PIC); - void reportCrashIR(); + LLVM_ABI ~PrintCrashIRInstrumentation(); + LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC); + LLVM_ABI void reportCrashIR(); protected: std::string SavedIR; @@ -614,26 +616,22 @@ class StandardInstrumentations { bool VerifyEach; public: + LLVM_ABI StandardInstrumentations(LLVMContext &Context, bool DebugLogging, bool VerifyEach = false, PrintPassOptions PrintPassOpts = PrintPassOptions()); // Register all the standard instrumentation callbacks. If \p FAM is nullptr // then PreservedCFGChecker is not enabled. 
- void registerCallbacks(PassInstrumentationCallbacks &PIC, - ModuleAnalysisManager *MAM = nullptr); + LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC, + ModuleAnalysisManager *MAM = nullptr); TimePassesHandler &getTimePasses() { return TimePasses; } }; -extern template class ChangeReporter; -extern template class TextChangeReporter; - extern template class BlockDataT; extern template class FuncDataT; extern template class IRDataT; -extern template class ChangeReporter>; -extern template class TextChangeReporter>; extern template class IRComparer; } // namespace llvm diff --git a/llvm/include/llvm/Support/CodeGen.h b/llvm/include/llvm/Support/CodeGen.h index 0e42789ba932e..48745f7f4d2a6 100644 --- a/llvm/include/llvm/Support/CodeGen.h +++ b/llvm/include/llvm/Support/CodeGen.h @@ -130,6 +130,15 @@ namespace llvm { Invalid = 2, ///< Not used. }; + enum class WinX64EHUnwindV2Mode { + // Don't use unwind v2 (i.e., use v1). + Disabled = 0, + // Use unwind v2 here possible, otherwise fallback to v1. + BestEffort = 1, + // Use unwind v2 everywhere, otherwise raise an error. 
+ Required = 2, + }; + } // namespace llvm #endif diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h index 51f25c1360b87..f29cbe78a1853 100644 --- a/llvm/include/llvm/Target/CGPassBuilderOption.h +++ b/llvm/include/llvm/Target/CGPassBuilderOption.h @@ -15,6 +15,7 @@ #define LLVM_TARGET_CGPASSBUILDEROPTION_H #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetOptions.h" #include @@ -82,7 +83,7 @@ struct CGPassBuilderOption { std::optional DebugifyCheckAndStripAll; }; -CGPassBuilderOption getCGPassBuilderOption(); +LLVM_ABI CGPassBuilderOption getCGPassBuilderOption(); } // namespace llvm diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h index 47617424a9688..27a688bc12abf 100644 --- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h +++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h @@ -16,6 +16,7 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCRegister.h" +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -43,7 +44,7 @@ class StringRef; class TargetMachine; class DSOLocalEquivalent; -class TargetLoweringObjectFile : public MCObjectFileInfo { +class LLVM_ABI TargetLoweringObjectFile : public MCObjectFileInfo { /// Name-mangler for global names. 
Mangler *Mang = nullptr; diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 906926729ed74..b286efdea3c19 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -19,6 +19,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/PGOOptions.h" #include "llvm/Target/CGPassBuilderOption.h" @@ -28,7 +29,7 @@ #include #include -extern llvm::cl::opt NoKernelInfoEndLTO; +LLVM_ABI extern llvm::cl::opt NoKernelInfoEndLTO; namespace llvm { @@ -78,7 +79,7 @@ struct MachineFunctionInfo; /// machine. All target-specific information should be accessible through this /// interface. /// -class TargetMachine { +class LLVM_ABI TargetMachine { protected: // Can only create subclasses. TargetMachine(const Target &T, StringRef DataLayoutString, const Triple &TargetTriple, StringRef CPU, StringRef FS, diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h index 10638a0ec902f..a7c46921255b8 100644 --- a/llvm/include/llvm/Target/TargetOptions.h +++ b/llvm/include/llvm/Target/TargetOptions.h @@ -16,6 +16,7 @@ #include "llvm/ADT/FloatingPointMode.h" #include "llvm/MC/MCTargetOptions.h" +#include "llvm/Support/Compiler.h" #include @@ -158,12 +159,12 @@ class TargetOptions { /// DisableFramePointerElim - This returns true if frame pointer elimination /// optimization should be disabled for the given machine function. - bool DisableFramePointerElim(const MachineFunction &MF) const; + LLVM_ABI bool DisableFramePointerElim(const MachineFunction &MF) const; /// FramePointerIsReserved - This returns true if the frame pointer must /// always either point to a new frame record or be un-modified in the given /// function. 
- bool FramePointerIsReserved(const MachineFunction &MF) const; + LLVM_ABI bool FramePointerIsReserved(const MachineFunction &MF) const; /// If greater than 0, override the default value of /// MCAsmInfo::BinutilsVersion. @@ -219,7 +220,7 @@ class TargetOptions { /// truncations). If this is enabled (set to true), the code generator must /// assume that the rounding mode may dynamically change. unsigned HonorSignDependentRoundingFPMathOption : 1; - bool HonorSignDependentRoundingFPMath() const; + LLVM_ABI bool HonorSignDependentRoundingFPMath() const; /// NoZerosInBSS - By default some codegens place zero-initialized data to /// .bss section. This flag disables such behaviour (necessary, e.g. for @@ -346,7 +347,7 @@ class TargetOptions { unsigned EnableDebugEntryValues : 1; /// NOTE: There are targets that still do not support the debug entry values /// production. - bool ShouldEmitDebugEntryValues() const; + LLVM_ABI bool ShouldEmitDebugEntryValues() const; // When set to true, use experimental new debug variable location tracking, // which seeks to follow the values of variables rather than their location, @@ -450,7 +451,7 @@ class TargetOptions { DenormalMode getRawFP32DenormalMode() const { return FP32DenormalMode; } - DenormalMode getDenormalMode(const fltSemantics &FPType) const; + LLVM_ABI DenormalMode getDenormalMode(const fltSemantics &FPType) const; /// What exception model to use ExceptionHandling ExceptionModel = ExceptionHandling::None; diff --git a/llvm/include/llvm/TargetParser/SubtargetFeature.h b/llvm/include/llvm/TargetParser/SubtargetFeature.h index 6f1723dec5d04..cdcfcdd0e802e 100644 --- a/llvm/include/llvm/TargetParser/SubtargetFeature.h +++ b/llvm/include/llvm/TargetParser/SubtargetFeature.h @@ -32,7 +32,7 @@ namespace llvm { class raw_ostream; class Triple; -const unsigned MAX_SUBTARGET_WORDS = 5; +const unsigned MAX_SUBTARGET_WORDS = 6; const unsigned MAX_SUBTARGET_FEATURES = MAX_SUBTARGET_WORDS * 64; /// Container class for subtarget 
features. diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h index 56b30968ffd77..7523ae66429ac 100644 --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -14,6 +14,8 @@ #ifndef LLVM_TRANSFORMS_IPO_H #define LLVM_TRANSFORMS_IPO_H +#include "llvm/Support/Compiler.h" + namespace llvm { class ModulePass; diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index e6eb756df987d..f19f3292c4798 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -6335,6 +6335,47 @@ struct AAUnderlyingObjects : AbstractAttribute { AA::ValueScope Scope = AA::Interprocedural) const = 0; }; +/// An abstract interface for identifying pointers from which loads can be +/// marked invariant. +struct AAInvariantLoadPointer : public AbstractAttribute { + AAInvariantLoadPointer(const IRPosition &IRP) : AbstractAttribute(IRP) {} + + /// See AbstractAttribute::isValidIRPositionForInit + static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) { + if (!IRP.getAssociatedType()->isPointerTy()) + return false; + + return AbstractAttribute::isValidIRPositionForInit(A, IRP); + } + + /// Create an abstract attribute view for the position \p IRP. + static AAInvariantLoadPointer &createForPosition(const IRPosition &IRP, + Attributor &A); + + /// Return true if the pointer's contents are known to remain invariant. + virtual bool isKnownInvariant() const = 0; + virtual bool isKnownLocallyInvariant() const = 0; + + /// Return true if the pointer's contents are assumed to remain invariant. + virtual bool isAssumedInvariant() const = 0; + virtual bool isAssumedLocallyInvariant() const = 0; + + /// See AbstractAttribute::getName(). + StringRef getName() const override { return "AAInvariantLoadPointer"; } + + /// See AbstractAttribute::getIdAddr(). 
+ const char *getIdAddr() const override { return &ID; } + + /// This function should return true if the type of the \p AA is + /// AAInvariantLoadPointer + static bool classof(const AbstractAttribute *AA) { + return (AA->getIdAddr() == &ID); + } + + /// Unique ID (due to the unique address). + static const char ID; +}; + /// An abstract interface for address space information. struct AAAddressSpace : public StateWrapper { AAAddressSpace(const IRPosition &IRP, Attributor &A) diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h index e156ec469a14f..245414935bc0f 100644 --- a/llvm/include/llvm/Transforms/Scalar/GVN.h +++ b/llvm/include/llvm/Transforms/Scalar/GVN.h @@ -46,7 +46,9 @@ class ImplicitControlFlowTracking; class LoadInst; class LoopInfo; class MemDepResult; +class MemoryAccess; class MemoryDependenceResults; +class MemoryLocation; class MemorySSA; class MemorySSAUpdater; class NonLocalDepResult; @@ -170,6 +172,10 @@ class GVNPass : public PassInfoMixin { // Value number to PHINode mapping. Used for phi-translate in scalarpre. DenseMap NumberingPhi; + // Value number to BasicBlock mapping. Used for phi-translate across + // MemoryPhis. + DenseMap NumberingBB; + // Cache for phi-translate in scalarpre. 
using PhiTranslateMap = DenseMap, uint32_t>; @@ -177,6 +183,9 @@ class GVNPass : public PassInfoMixin { AAResults *AA = nullptr; MemoryDependenceResults *MD = nullptr; + bool IsMDEnabled = false; + MemorySSA *MSSA = nullptr; + bool IsMSSAEnabled = false; DominatorTree *DT = nullptr; uint32_t NextValueNumber = 1; @@ -187,12 +196,14 @@ class GVNPass : public PassInfoMixin { Expression createExtractvalueExpr(ExtractValueInst *EI); Expression createGEPExpr(GetElementPtrInst *GEP); uint32_t lookupOrAddCall(CallInst *C); + uint32_t computeLoadStoreVN(Instruction *I); uint32_t phiTranslateImpl(const BasicBlock *BB, const BasicBlock *PhiBlock, uint32_t Num, GVNPass &GVN); bool areCallValsEqual(uint32_t Num, uint32_t NewNum, const BasicBlock *Pred, const BasicBlock *PhiBlock, GVNPass &GVN); std::pair assignExpNewValueNum(Expression &Exp); bool areAllValsInBB(uint32_t Num, const BasicBlock *BB, GVNPass &GVN); + void addMemoryStateToExp(Instruction *I, Expression &Exp); public: LLVM_ABI ValueTable(); @@ -201,6 +212,7 @@ class GVNPass : public PassInfoMixin { LLVM_ABI ~ValueTable(); LLVM_ABI ValueTable &operator=(const ValueTable &Arg); + LLVM_ABI uint32_t lookupOrAdd(MemoryAccess *MA); LLVM_ABI uint32_t lookupOrAdd(Value *V); LLVM_ABI uint32_t lookup(Value *V, bool Verify = true) const; LLVM_ABI uint32_t lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Pred, @@ -216,7 +228,14 @@ class GVNPass : public PassInfoMixin { LLVM_ABI void erase(Value *V); void setAliasAnalysis(AAResults *A) { AA = A; } AAResults *getAliasAnalysis() const { return AA; } - void setMemDep(MemoryDependenceResults *M) { MD = M; } + void setMemDep(MemoryDependenceResults *M, bool MDEnabled = true) { + MD = M; + IsMDEnabled = MDEnabled; + } + void setMemorySSA(MemorySSA *M, bool MSSAEnabled = false) { + MSSA = M; + IsMSSAEnabled = MSSAEnabled; + } void setDomTree(DominatorTree *D) { DT = D; } uint32_t getNextUnusedValueNumber() { return NextValueNumber; } LLVM_ABI void verifyRemoved(const Value *) const; 
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index 55e153f289590..df146458b4e6f 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -394,12 +394,9 @@ handleUnreachableTerminator(Instruction *I, SmallVectorImpl &PoisonedValues); /// Remove all instructions from a basic block other than its terminator -/// and any present EH pad instructions. Returns a pair where the first element -/// is the number of instructions (excluding debug info intrinsics) that have -/// been removed, and the second element is the number of debug info intrinsics +/// and any present EH pad instructions. Returns the number of instructions /// that have been removed. -LLVM_ABI std::pair -removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB); +LLVM_ABI unsigned removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB); /// Insert an unreachable instruction before the specified /// instruction, making it and the rest of the code in the block dead. 
diff --git a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h index ab0bd3a5a9962..f20ae1809aa56 100644 --- a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h +++ b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h @@ -14,6 +14,8 @@ #ifndef LLVM_TRANSFORMS_UTILS_PROMOTEMEMTOREG_H #define LLVM_TRANSFORMS_UTILS_PROMOTEMEMTOREG_H +#include "llvm/Support/Compiler.h" + namespace llvm { template class ArrayRef; diff --git a/llvm/include/llvm/XRay/BlockIndexer.h b/llvm/include/llvm/XRay/BlockIndexer.h index 77af77e5ec269..e9782dafed618 100644 --- a/llvm/include/llvm/XRay/BlockIndexer.h +++ b/llvm/include/llvm/XRay/BlockIndexer.h @@ -14,6 +14,7 @@ #define LLVM_XRAY_BLOCKINDEXER_H #include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Compiler.h" #include "llvm/XRay/FDRRecords.h" #include #include @@ -23,7 +24,7 @@ namespace xray { // The BlockIndexer will gather all related records associated with a // process+thread and group them by 'Block'. 
-class BlockIndexer : public RecordVisitor { +class LLVM_ABI BlockIndexer : public RecordVisitor { public: struct Block { uint64_t ProcessID; diff --git a/llvm/include/llvm/XRay/BlockPrinter.h b/llvm/include/llvm/XRay/BlockPrinter.h index 2f9fed668069c..caf78c5c4a5a6 100644 --- a/llvm/include/llvm/XRay/BlockPrinter.h +++ b/llvm/include/llvm/XRay/BlockPrinter.h @@ -13,6 +13,7 @@ #ifndef LLVM_XRAY_BLOCKPRINTER_H #define LLVM_XRAY_BLOCKPRINTER_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" #include "llvm/XRay/FDRRecords.h" #include "llvm/XRay/RecordPrinter.h" @@ -20,7 +21,7 @@ namespace llvm { namespace xray { -class BlockPrinter : public RecordVisitor { +class LLVM_ABI BlockPrinter : public RecordVisitor { enum class State { Start, Preamble, diff --git a/llvm/include/llvm/XRay/BlockVerifier.h b/llvm/include/llvm/XRay/BlockVerifier.h index 2450ad89ffe3d..b88785c393e37 100644 --- a/llvm/include/llvm/XRay/BlockVerifier.h +++ b/llvm/include/llvm/XRay/BlockVerifier.h @@ -13,12 +13,13 @@ #ifndef LLVM_XRAY_BLOCKVERIFIER_H #define LLVM_XRAY_BLOCKVERIFIER_H +#include "llvm/Support/Compiler.h" #include "llvm/XRay/FDRRecords.h" namespace llvm { namespace xray { -class BlockVerifier : public RecordVisitor { +class LLVM_ABI BlockVerifier : public RecordVisitor { public: // We force State elements to be size_t, to be used as indices for containers. enum class State : std::size_t { diff --git a/llvm/include/llvm/XRay/FDRRecordConsumer.h b/llvm/include/llvm/XRay/FDRRecordConsumer.h index 8fff9fb861582..473777f0e04f2 100644 --- a/llvm/include/llvm/XRay/FDRRecordConsumer.h +++ b/llvm/include/llvm/XRay/FDRRecordConsumer.h @@ -8,6 +8,7 @@ #ifndef LLVM_XRAY_FDRRECORDCONSUMER_H #define LLVM_XRAY_FDRRECORDCONSUMER_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/XRay/FDRRecords.h" #include @@ -25,7 +26,7 @@ class RecordConsumer { // This consumer will collect all the records into a vector of records, in // arrival order. 
-class LogBuilderConsumer : public RecordConsumer { +class LLVM_ABI LogBuilderConsumer : public RecordConsumer { std::vector> &Records; public: @@ -38,7 +39,7 @@ class LogBuilderConsumer : public RecordConsumer { // A PipelineConsumer applies a set of visitors to every consumed Record, in the // order by which the visitors are added to the pipeline in the order of // appearance. -class PipelineConsumer : public RecordConsumer { +class LLVM_ABI PipelineConsumer : public RecordConsumer { std::vector Visitors; public: diff --git a/llvm/include/llvm/XRay/FDRRecordProducer.h b/llvm/include/llvm/XRay/FDRRecordProducer.h index 25c123aec1b29..083b57139d397 100644 --- a/llvm/include/llvm/XRay/FDRRecordProducer.h +++ b/llvm/include/llvm/XRay/FDRRecordProducer.h @@ -8,6 +8,7 @@ #ifndef LLVM_XRAY_FDRRECORDPRODUCER_H #define LLVM_XRAY_FDRRECORDPRODUCER_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/XRay/FDRRecords.h" #include "llvm/XRay/XRayRecord.h" @@ -24,7 +25,7 @@ class RecordProducer { virtual ~RecordProducer() = default; }; -class FileBasedRecordProducer : public RecordProducer { +class LLVM_ABI FileBasedRecordProducer : public RecordProducer { const XRayFileHeader &Header; DataExtractor &E; uint64_t &OffsetPtr; diff --git a/llvm/include/llvm/XRay/FDRRecords.h b/llvm/include/llvm/XRay/FDRRecords.h index 8af88f5b0e132..7ee8db61b2106 100644 --- a/llvm/include/llvm/XRay/FDRRecords.h +++ b/llvm/include/llvm/XRay/FDRRecords.h @@ -13,6 +13,7 @@ #ifndef LLVM_XRAY_FDRRECORDS_H #define LLVM_XRAY_FDRRECORDS_H +#include "llvm/Support/Compiler.h" #include #include @@ -47,7 +48,7 @@ class Record { RK_Function, }; - static StringRef kindToString(RecordKind K); + LLVM_ABI static StringRef kindToString(RecordKind K); private: const RecordKind T; @@ -107,7 +108,7 @@ class MetadataRecord : public Record { // What follows are specific Metadata record types which encapsulate the // information associated with specific metadata record types in an FDR mode 
// log. -class BufferExtents : public MetadataRecord { +class LLVM_ABI BufferExtents : public MetadataRecord { uint64_t Size = 0; friend class RecordInitializer; @@ -130,7 +131,7 @@ class BufferExtents : public MetadataRecord { } }; -class WallclockRecord : public MetadataRecord { +class LLVM_ABI WallclockRecord : public MetadataRecord { uint64_t Seconds = 0; uint32_t Nanos = 0; friend class RecordInitializer; @@ -155,7 +156,7 @@ class WallclockRecord : public MetadataRecord { } }; -class NewCPUIDRecord : public MetadataRecord { +class LLVM_ABI NewCPUIDRecord : public MetadataRecord { uint16_t CPUId = 0; uint64_t TSC = 0; friend class RecordInitializer; @@ -181,7 +182,7 @@ class NewCPUIDRecord : public MetadataRecord { } }; -class TSCWrapRecord : public MetadataRecord { +class LLVM_ABI TSCWrapRecord : public MetadataRecord { uint64_t BaseTSC = 0; friend class RecordInitializer; @@ -203,7 +204,7 @@ class TSCWrapRecord : public MetadataRecord { } }; -class CustomEventRecord : public MetadataRecord { +class LLVM_ABI CustomEventRecord : public MetadataRecord { int32_t Size = 0; uint64_t TSC = 0; uint16_t CPU = 0; @@ -232,7 +233,7 @@ class CustomEventRecord : public MetadataRecord { } }; -class CustomEventRecordV5 : public MetadataRecord { +class LLVM_ABI CustomEventRecordV5 : public MetadataRecord { int32_t Size = 0; int32_t Delta = 0; std::string Data{}; @@ -259,7 +260,7 @@ class CustomEventRecordV5 : public MetadataRecord { } }; -class TypedEventRecord : public MetadataRecord { +class LLVM_ABI TypedEventRecord : public MetadataRecord { int32_t Size = 0; int32_t Delta = 0; uint16_t EventType = 0; @@ -288,7 +289,7 @@ class TypedEventRecord : public MetadataRecord { } }; -class CallArgRecord : public MetadataRecord { +class LLVM_ABI CallArgRecord : public MetadataRecord { uint64_t Arg = 0; friend class RecordInitializer; @@ -310,7 +311,7 @@ class CallArgRecord : public MetadataRecord { } }; -class PIDRecord : public MetadataRecord { +class LLVM_ABI PIDRecord : public 
MetadataRecord { int32_t PID = 0; friend class RecordInitializer; @@ -333,7 +334,7 @@ class PIDRecord : public MetadataRecord { } }; -class NewBufferRecord : public MetadataRecord { +class LLVM_ABI NewBufferRecord : public MetadataRecord { int32_t TID = 0; friend class RecordInitializer; @@ -356,7 +357,7 @@ class NewBufferRecord : public MetadataRecord { } }; -class EndBufferRecord : public MetadataRecord { +class LLVM_ABI EndBufferRecord : public MetadataRecord { public: EndBufferRecord() : MetadataRecord(RecordKind::RK_Metadata_EndOfBuffer, @@ -369,7 +370,7 @@ class EndBufferRecord : public MetadataRecord { } }; -class FunctionRecord : public Record { +class LLVM_ABI FunctionRecord : public Record { RecordTypes Kind; int32_t FuncId = 0; uint32_t Delta = 0; @@ -415,7 +416,7 @@ class RecordVisitor { virtual Error visit(TypedEventRecord &) = 0; }; -class RecordInitializer : public RecordVisitor { +class LLVM_ABI RecordInitializer : public RecordVisitor { DataExtractor &E; uint64_t &OffsetPtr; uint16_t Version; diff --git a/llvm/include/llvm/XRay/FDRTraceWriter.h b/llvm/include/llvm/XRay/FDRTraceWriter.h index 40d5f5af91c92..a3dc58e03333e 100644 --- a/llvm/include/llvm/XRay/FDRTraceWriter.h +++ b/llvm/include/llvm/XRay/FDRTraceWriter.h @@ -12,8 +12,9 @@ #ifndef LLVM_XRAY_FDRTRACEWRITER_H #define LLVM_XRAY_FDRTRACEWRITER_H -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/EndianStream.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/XRay/FDRRecords.h" #include "llvm/XRay/XRayRecord.h" @@ -26,7 +27,7 @@ namespace xray { /// generate various kinds of execution traces without using the XRay runtime. /// Note that this writer does not do any validation, but uses the types of /// records defined in the FDRRecords.h file. -class FDRTraceWriter : public RecordVisitor { +class LLVM_ABI FDRTraceWriter : public RecordVisitor { public: // Construct an FDRTraceWriter associated with an output stream. 
explicit FDRTraceWriter(raw_ostream &O, const XRayFileHeader &H); diff --git a/llvm/include/llvm/XRay/FileHeaderReader.h b/llvm/include/llvm/XRay/FileHeaderReader.h index 485d26d71456b..ecdb975a30661 100644 --- a/llvm/include/llvm/XRay/FileHeaderReader.h +++ b/llvm/include/llvm/XRay/FileHeaderReader.h @@ -13,6 +13,7 @@ #ifndef LLVM_XRAY_FILEHEADERREADER_H #define LLVM_XRAY_FILEHEADERREADER_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Error.h" #include "llvm/XRay/XRayRecord.h" @@ -23,8 +24,8 @@ namespace xray { /// Convenience function for loading the file header given a data extractor at a /// specified offset. -Expected readBinaryFormatHeader(DataExtractor &HeaderExtractor, - uint64_t &OffsetPtr); +LLVM_ABI Expected +readBinaryFormatHeader(DataExtractor &HeaderExtractor, uint64_t &OffsetPtr); } // namespace xray } // namespace llvm diff --git a/llvm/include/llvm/XRay/InstrumentationMap.h b/llvm/include/llvm/XRay/InstrumentationMap.h index 1979108ff4133..54737e226df89 100644 --- a/llvm/include/llvm/XRay/InstrumentationMap.h +++ b/llvm/include/llvm/XRay/InstrumentationMap.h @@ -15,6 +15,7 @@ #define LLVM_XRAY_INSTRUMENTATIONMAP_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/YAMLTraits.h" #include @@ -31,7 +32,8 @@ class InstrumentationMap; /// Loads the instrumentation map from |Filename|. This auto-deduces the type of /// the instrumentation map. -Expected loadInstrumentationMap(StringRef Filename); +LLVM_ABI Expected +loadInstrumentationMap(StringRef Filename); /// Represents an XRay instrumentation sled entry from an object file. 
struct SledEntry { @@ -83,17 +85,18 @@ class InstrumentationMap { FunctionAddressMap FunctionAddresses; FunctionAddressReverseMap FunctionIds; - friend Expected loadInstrumentationMap(StringRef); + LLVM_ABI_FRIEND friend Expected + loadInstrumentationMap(StringRef); public: /// Provides a raw accessor to the unordered map of function addresses. const FunctionAddressMap &getFunctionAddresses() { return FunctionAddresses; } /// Returns an XRay computed function id, provided a function address. - std::optional getFunctionId(uint64_t Addr) const; + LLVM_ABI std::optional getFunctionId(uint64_t Addr) const; /// Returns the function address for a function id. - std::optional getFunctionAddr(int32_t FuncId) const; + LLVM_ABI std::optional getFunctionAddr(int32_t FuncId) const; /// Provide read-only access to the entries of the instrumentation map. const SledContainer &sleds() const { return Sleds; }; diff --git a/llvm/include/llvm/XRay/Profile.h b/llvm/include/llvm/XRay/Profile.h index 79d9b53387f39..e30c01e489d33 100644 --- a/llvm/include/llvm/XRay/Profile.h +++ b/llvm/include/llvm/XRay/Profile.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include #include @@ -34,18 +35,18 @@ class Trace; /// /// For any errors encountered in the loading of the profile data from /// |Filename|, this function will return an Error condition appropriately. -Expected loadProfile(StringRef Filename); +LLVM_ABI Expected loadProfile(StringRef Filename); /// This algorithm will merge two Profile instances into a single Profile /// instance, aggregating blocks by Thread ID. -Profile mergeProfilesByThread(const Profile &L, const Profile &R); +LLVM_ABI Profile mergeProfilesByThread(const Profile &L, const Profile &R); /// This algorithm will merge two Profile instances into a single Profile /// instance, aggregating blocks by function call stack. 
-Profile mergeProfilesByStack(const Profile &L, const Profile &R); +LLVM_ABI Profile mergeProfilesByStack(const Profile &L, const Profile &R); /// This function takes a Trace and creates a Profile instance from it. -Expected profileFromTrace(const Trace &T); +LLVM_ABI Expected profileFromTrace(const Trace &T); /// Profile instances are thread-compatible. class Profile { @@ -68,11 +69,11 @@ class Profile { /// /// Returns an error if |P| had not been interned before into the Profile. /// - Expected> expandPath(PathID P) const; + LLVM_ABI Expected> expandPath(PathID P) const; /// The stack represented in |P| must be in stack order (leaf to root). This /// will always return the same PathID for |P| that has the same sequence. - PathID internPath(ArrayRef P); + LLVM_ABI PathID internPath(ArrayRef P); /// Appends a fully-formed Block instance into the Profile. /// @@ -80,7 +81,7 @@ class Profile { /// /// - The PathData component of the Block is empty /// - Error addBlock(Block &&B); + LLVM_ABI Error addBlock(Block &&B); Profile() = default; ~Profile() = default; @@ -99,8 +100,8 @@ class Profile { return *this; } - Profile(const Profile &); - Profile &operator=(const Profile &); + LLVM_ABI Profile(const Profile &); + LLVM_ABI Profile &operator=(const Profile &); friend void swap(Profile &L, Profile &R) { using std::swap; diff --git a/llvm/include/llvm/XRay/RecordPrinter.h b/llvm/include/llvm/XRay/RecordPrinter.h index 8ca4794dce5e2..5d2c27757255a 100644 --- a/llvm/include/llvm/XRay/RecordPrinter.h +++ b/llvm/include/llvm/XRay/RecordPrinter.h @@ -13,13 +13,14 @@ #ifndef LLVM_XRAY_RECORDPRINTER_H #define LLVM_XRAY_RECORDPRINTER_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" #include "llvm/XRay/FDRRecords.h" namespace llvm { namespace xray { -class RecordPrinter : public RecordVisitor { +class LLVM_ABI RecordPrinter : public RecordVisitor { raw_ostream &OS; std::string Delim; diff --git a/llvm/include/llvm/XRay/Trace.h 
b/llvm/include/llvm/XRay/Trace.h index eb1f03b2a0d4a..af1d35c67817b 100644 --- a/llvm/include/llvm/XRay/Trace.h +++ b/llvm/include/llvm/XRay/Trace.h @@ -16,6 +16,7 @@ #include #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Error.h" #include "llvm/XRay/XRayRecord.h" @@ -50,7 +51,7 @@ class Trace { typedef std::vector::const_iterator citerator; - friend Expected loadTrace(const DataExtractor &, bool); + LLVM_ABI_FRIEND friend Expected loadTrace(const DataExtractor &, bool); public: using size_type = RecordVector::size_type; @@ -68,11 +69,12 @@ class Trace { /// This function will attempt to load XRay trace records from the provided /// |Filename|. -Expected loadTraceFile(StringRef Filename, bool Sort = false); +LLVM_ABI Expected loadTraceFile(StringRef Filename, bool Sort = false); /// This function will attempt to load XRay trace records from the provided /// DataExtractor. -Expected loadTrace(const DataExtractor &Extractor, bool Sort = false); +LLVM_ABI Expected loadTrace(const DataExtractor &Extractor, + bool Sort = false); } // namespace xray } // namespace llvm diff --git a/llvm/lib/Analysis/AliasSetTracker.cpp b/llvm/lib/Analysis/AliasSetTracker.cpp index 6d1dafbae60b9..1e2f05b60a9a3 100644 --- a/llvm/lib/Analysis/AliasSetTracker.cpp +++ b/llvm/lib/Analysis/AliasSetTracker.cpp @@ -343,9 +343,6 @@ void AliasSetTracker::add(AnyMemTransferInst *MTI) { } void AliasSetTracker::addUnknown(Instruction *Inst) { - if (isa(Inst)) - return; // Ignore DbgInfo Intrinsics. - if (auto *II = dyn_cast(Inst)) { // These intrinsics will show up as affecting memory, but they are just // markers. 
diff --git a/llvm/lib/Analysis/CallGraph.cpp b/llvm/lib/Analysis/CallGraph.cpp index 5d1af52e8ab58..d7695e5cfc0d3 100644 --- a/llvm/lib/Analysis/CallGraph.cpp +++ b/llvm/lib/Analysis/CallGraph.cpp @@ -34,8 +34,7 @@ CallGraph::CallGraph(Module &M) CallsExternalNode(std::make_unique(this, nullptr)) { // Add every interesting function to the call graph. for (Function &F : M) - if (!isDbgInfoIntrinsic(F.getIntrinsicID())) - addToCallGraph(&F); + addToCallGraph(&F); } CallGraph::CallGraph(CallGraph &&Arg) @@ -101,7 +100,7 @@ void CallGraph::populateCallGraphNode(CallGraphNode *Node) { const Function *Callee = Call->getCalledFunction(); if (!Callee) Node->addCalledFunction(Call, CallsExternalNode.get()); - else if (!isDbgInfoIntrinsic(Callee->getIntrinsicID())) + else Node->addCalledFunction(Call, getOrInsertFunction(Callee)); // Add reference to callback functions. diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 2b7a438a9ef01..b58f9b26a8651 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -432,6 +432,10 @@ bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, unsigned char *CurPtr, assert(ByteOffset <= DL.getTypeAllocSize(C->getType()) && "Out of range access"); + // Reading type padding, return zero. + if (ByteOffset >= DL.getTypeStoreSize(C->getType())) + return true; + // If this element is zero or undefined, we can just return since *CurPtr is // zero initialized. 
if (isa(C) || isa(C)) diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp index d7e2a3fa4fc59..6694d5cc06c8c 100644 --- a/llvm/lib/Analysis/DemandedBits.cpp +++ b/llvm/lib/Analysis/DemandedBits.cpp @@ -46,8 +46,7 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "demanded-bits" static bool isAlwaysLive(Instruction *I) { - return I->isTerminator() || isa(I) || I->isEHPad() || - I->mayHaveSideEffects(); + return I->isTerminator() || I->isEHPad() || I->mayHaveSideEffects(); } void DemandedBits::determineLiveOperandBits( diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 0f7303c1b0917..fa38c35796a0e 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -13,7 +13,9 @@ #include "llvm/Analysis/IR2Vec.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/CFG.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/Debug.h" @@ -190,7 +192,8 @@ Embedding SymbolicEmbedder::getOperandEmbedding(const Value *Op) const { void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { Embedding BBVector(Dimension, 0); - for (const auto &I : BB) { + // We consider only the non-debug and non-pseudo instructions + for (const auto &I : BB.instructionsWithoutDebug()) { Embedding InstVector(Dimension, 0); const auto OpcVec = lookupVocab(I.getOpcodeName()); @@ -215,9 +218,11 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { void SymbolicEmbedder::computeEmbeddings() const { if (F.isDeclaration()) return; - for (const auto &BB : F) { - computeEmbeddings(BB); - FuncVector += BBVecMap[&BB]; + + // Consider only the basic blocks that are reachable from entry + for (const BasicBlock *BB : depth_first(&F)) { + computeEmbeddings(*BB); + FuncVector += BBVecMap[BB]; } } diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index e397a228afee0..d1ac8d9fbdfd1 
100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -6969,6 +6969,23 @@ static Value *simplifyIntrinsic(CallBase *Call, Value *Callee, } return nullptr; } + case Intrinsic::experimental_vp_reverse: { + Value *Vec = Call->getArgOperand(0); + Value *Mask = Call->getArgOperand(1); + Value *EVL = Call->getArgOperand(2); + + Value *X; + // vp.reverse(vp.reverse(X)) == X (with all ones mask and matching EVL) + if (match(Mask, m_AllOnes()) && + match(Vec, m_Intrinsic( + m_Value(X), m_AllOnes(), m_Specific(EVL)))) + return X; + + // vp.reverse(splat(X)) -> splat(X) (regardless of mask and EVL) + if (isSplatValue(Vec)) + return Vec; + return nullptr; + } default: return nullptr; } diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 425f3682122cd..71a75b496455a 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -434,7 +434,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &S // If we see a free or a call which may write to memory (i.e. which might do // a free) the pointer could be marked invalid. 
if (isa(BBI) && BBI->mayWriteToMemory() && - !isa(BBI) && !isa(BBI)) + !isa(BBI)) return false; Value *AccessedPtr; diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index 6b7a3e1ffe347..e0b7f65d18a30 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -428,6 +428,9 @@ llvm::getAllocSize(const CallBase *CB, const TargetLibraryInfo *TLI, Constant *llvm::getInitialValueOfAllocation(const Value *V, const TargetLibraryInfo *TLI, Type *Ty) { + if (isa(V)) + return UndefValue::get(Ty); + auto *Alloc = dyn_cast(V); if (!Alloc) return nullptr; diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index f062189bac6a0..d6f490cb69a52 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -188,9 +188,6 @@ MemDepResult MemoryDependenceResults::getCallDependencyFrom( // Walk backwards through the block, looking for dependencies. while (ScanIt != BB->begin()) { Instruction *Inst = &*--ScanIt; - // Debug intrinsics don't cause dependences and should not affect Limit - if (isa(Inst)) - continue; // Limit the amount of scanning we do so we don't end up with quadratic // running time on extreme testcases. @@ -432,11 +429,6 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( while (ScanIt != BB->begin()) { Instruction *Inst = &*--ScanIt; - if (IntrinsicInst *II = dyn_cast(Inst)) - // Debug intrinsics don't (and can't) cause dependencies. - if (isa(II)) - continue; - // Limit the amount of scanning we do so we don't end up with quadratic // running time on extreme testcases. 
--*Limit; diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index 3b42bb412b9ba..c8daab7abde18 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -111,7 +111,9 @@ MemoryLocation MemoryLocation::getForDest(const AnyMemIntrinsic *MI) { std::optional MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) { - if (!CB->onlyAccessesArgMemory()) + // Check that the only possible writes are to arguments. + MemoryEffects WriteME = CB->getMemoryEffects() & MemoryEffects::writeOnly(); + if (!WriteME.onlyAccessesArgPointees()) return std::nullopt; if (CB->hasOperandBundles()) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 2dfe625eb0dcc..dd309bc2c54a8 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -10892,7 +10892,12 @@ bool ScalarEvolution::SimplifyICmpOperands(CmpPredicate &Pred, const SCEV *&LHS, } break; case ICmpInst::ICMP_UGE: - if (!getUnsignedRangeMin(RHS).isMinValue()) { + // If RHS is an op we can fold the -1, try that first. + // Otherwise prefer LHS to preserve the nuw flag. 
+ if ((isa(RHS) || + (isa(RHS) && + isa(cast(RHS)->getOperand(0)))) && + !getUnsignedRangeMin(RHS).isMinValue()) { RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS); Pred = ICmpInst::ICMP_UGT; Changed = true; @@ -10901,6 +10906,10 @@ bool ScalarEvolution::SimplifyICmpOperands(CmpPredicate &Pred, const SCEV *&LHS, SCEV::FlagNUW); Pred = ICmpInst::ICMP_UGT; Changed = true; + } else if (!getUnsignedRangeMin(RHS).isMinValue()) { + RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS); + Pred = ICmpInst::ICMP_UGT; + Changed = true; } break; default: diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index c8b568354965d..a3ed093134390 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -1299,6 +1299,14 @@ static const VecDesc VecFuncs_LIBMVEC_X86[] = { #undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS }; +static const VecDesc VecFuncs_LIBMVEC_AARCH64[] = { +#define TLI_DEFINE_LIBMVEC_AARCH64_VECFUNCS +#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX, CC) \ + {SCAL, VEC, VF, MASK, VABI_PREFIX, CC}, +#include "llvm/Analysis/VecFuncs.def" +#undef TLI_DEFINE_LIBMVEC_AARCH64_VECFUNCS +}; + static const VecDesc VecFuncs_MASSV[] = { #define TLI_DEFINE_MASSV_VECFUNCS #include "llvm/Analysis/VecFuncs.def" @@ -1376,6 +1384,10 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( case llvm::Triple::x86_64: addVectorizableFunctions(VecFuncs_LIBMVEC_X86); break; + case llvm::Triple::aarch64: + case llvm::Triple::aarch64_be: + addVectorizableFunctions(VecFuncs_LIBMVEC_AARCH64); + break; } break; } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 9df667926faf0..a17417cb5189c 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7846,8 +7846,6 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor( iterator_range Range, unsigned ScanLimit) { assert(ScanLimit 
&& "scan limit must be non-zero"); for (const Instruction &I : Range) { - if (isa(I)) - continue; if (--ScanLimit == 0) return false; if (!isGuaranteedToTransferExecutionToSuccessor(&I)) @@ -8050,8 +8048,6 @@ static bool programUndefinedIfUndefOrPoison(const Value *V, // well-defined operands. for (const auto &I : make_range(Begin, End)) { - if (isa(I)) - continue; if (--ScanLimit == 0) break; @@ -8076,8 +8072,6 @@ static bool programUndefinedIfUndefOrPoison(const Value *V, while (true) { for (const auto &I : make_range(Begin, End)) { - if (isa(I)) - continue; if (--ScanLimit == 0) return false; if (mustTriggerUB(&I, YieldsPoison)) diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp index b9b10a541b263..0d17dc175fed9 100644 --- a/llvm/lib/BinaryFormat/Dwarf.cpp +++ b/llvm/lib/BinaryFormat/Dwarf.cpp @@ -911,6 +911,18 @@ StringRef llvm::dwarf::RLEString(unsigned RLE) { } } +StringRef (*const llvm::dwarf::EnumTraits::StringFn)(unsigned) = TagString; +StringRef (*const llvm::dwarf::EnumTraits::StringFn)(unsigned) = + AttributeString; +StringRef (*const llvm::dwarf::EnumTraits::StringFn)(unsigned) = + FormEncodingString; +StringRef (*const llvm::dwarf::EnumTraits::StringFn)(unsigned) = + OperationEncodingString; +StringRef (*const llvm::dwarf::EnumTraits::StringFn)(unsigned) = + LNStandardString; +StringRef (*const llvm::dwarf::EnumTraits::StringFn)(unsigned) = + IndexString; + constexpr char llvm::dwarf::EnumTraits::Type[]; constexpr char llvm::dwarf::EnumTraits::Type[]; constexpr char llvm::dwarf::EnumTraits::Type[]; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 2481a9bd3ce74..bfe6e7d6a802a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -895,7 +895,10 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) { } } -void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { +std::optional 
+DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { + // Args[0] is the return type. + std::optional ObjectPointerIndex; for (unsigned i = 1, N = Args.size(); i < N; ++i) { const DIType *Ty = Args[i]; if (!Ty) { @@ -906,8 +909,16 @@ void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) { addType(Arg, Ty); if (Ty->isArtificial()) addFlag(Arg, dwarf::DW_AT_artificial); + + if (Ty->isObjectPointer()) { + assert(!ObjectPointerIndex && + "Can't have more than one object pointer"); + ObjectPointerIndex = i; + } } } + + return ObjectPointerIndex; } void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) { @@ -1458,7 +1469,20 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, // Add arguments. Do not add arguments for subprogram definition. They will // be handled while processing variables. - constructSubprogramArguments(SPDie, Args); + // + // Encode the object pointer as an index instead of a DIE reference in order + // to minimize the affect on the .debug_info size. + if (std::optional ObjectPointerIndex = + constructSubprogramArguments(SPDie, Args)) { + if (getDwarfDebug().tuneForLLDB() && + getDwarfDebug().getDwarfVersion() >= 5) { + // 0th index in Args is the return type, hence adjust by 1. In DWARF + // we want the first parameter to be at index 0. + assert(*ObjectPointerIndex > 0); + addSInt(SPDie, dwarf::DW_AT_object_pointer, + dwarf::DW_FORM_implicit_const, *ObjectPointerIndex - 1); + } + } } addThrownTypes(SPDie, SP->getThrownTypes()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index e1156bccfb1ab..43bf197563867 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -273,7 +273,11 @@ class DwarfUnit : public DIEUnit { void constructContainingTypeDIEs(); /// Construct function argument DIEs. 
- void constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args); + /// + /// \returns The index of the object parameter in \c Args if one exists. + /// Returns std::nullopt otherwise. + std::optional constructSubprogramArguments(DIE &Buffer, + DITypeRefArray Args); /// Create a DIE with the given Tag, add the DIE to its parent, and /// call insertDIE if MD is not null. diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 3792b456c836e..43574a54c37dd 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -896,12 +896,7 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) { BasicBlock::iterator BBI = BI->getIterator(); if (BBI != BB->begin()) { --BBI; - while (isa(BBI)) { - if (BBI == BB->begin()) - break; - --BBI; - } - if (!isa(BBI) && !isa(BBI)) + if (!isa(BBI)) return nullptr; } @@ -2981,10 +2976,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, // Make sure there are no instructions between the first instruction // and return. BasicBlock::const_iterator BI = BB->getFirstNonPHIIt(); - // Skip over debug and the bitcast. - while (isa(BI) || &*BI == BCI || &*BI == EVI || - isa(BI) || isLifetimeEndOrBitCastFor(&*BI) || - isFakeUse(&*BI)) + // Skip over pseudo-probes and the bitcast. 
+ while (&*BI == BCI || &*BI == EVI || isa(BI) || + isLifetimeEndOrBitCastFor(&*BI) || isFakeUse(&*BI)) BI = std::next(BI); if (&*BI != RetI) return false; diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp index e3e6c72165ebb..0f2c580c759cf 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp @@ -16,7 +16,6 @@ #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/IR/DebugInfoMetadata.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index f68420ed66e4b..1c4150127a908 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -30,7 +30,6 @@ #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/InitializePasses.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 48b406e016c05..c3c5a0f5102d7 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -30,7 +30,6 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5d62ded171f4f..934199e414c7b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -396,6 +396,8 @@ namespace { bool PromoteLoad(SDValue Op); SDValue 
foldShiftToAvg(SDNode *N); + // Fold `a bitwiseop (~b +/- c)` -> `a bitwiseop ~(b -/+ c)` + SDValue foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT); SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, @@ -7204,6 +7206,32 @@ static SDValue foldLogicTreeOfShifts(SDNode *N, SDValue LeftHand, return DAG.getNode(LogicOpcode, DL, VT, CombinedShifts, W); } +/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the +/// equivalent `((x ^ y) & m) ^ y)` pattern. +/// This is typically a better representation for targets without a fused +/// "and-not" operation. +static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG, + const TargetLowering &TLI, const SDLoc &DL) { + // Note that masked-merge variants using XOR or ADD expressions are + // normalized to OR by InstCombine so we only check for OR. + assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); + + // If the target supports and-not, don't fold this. 
+ if (TLI.hasAndNot(SDValue(Node, 0))) + return SDValue(); + + SDValue M, X, Y; + if (sd_match(Node, + m_Or(m_OneUse(m_And(m_OneUse(m_Not(m_Value(M))), m_Value(Y))), + m_OneUse(m_And(m_Deferred(M), m_Value(X)))))) { + EVT VT = M.getValueType(); + SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, Y); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor, M); + return DAG.getNode(ISD::XOR, DL, VT, And, Y); + } + return SDValue(); +} + SDValue DAGCombiner::visitAND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -7541,6 +7569,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) { return DAG.getNode(ISD::AND, DL, VT, X, DAG.getNOT(DL, DAG.getNode(Opc, DL, VT, Y, Z), VT)); + // Fold (and X, (add (not Y), Z)) -> (and X, (not (sub Y, Z))) + // Fold (and X, (sub (not Y), Z)) -> (and X, (not (add Y, Z))) + if (TLI.hasAndNot(SDValue(N, 0))) + if (SDValue Folded = foldBitwiseOpWithNeg(N, DL, VT)) + return Folded; + // Fold (and (srl X, C), 1) -> (srl X, BW-1) for signbit extraction // If we are shifting down an extended sign bit, see if we can simplify // this to shifting the MSB directly to expose further simplifications. @@ -8128,32 +8162,6 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1, return SDValue(); } -/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the -/// equivalent `((x ^ y) & m) ^ y)` pattern. -/// This is typically a better representation for targets without a fused -/// "and-not" operation. -static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG, - const TargetLowering &TLI, const SDLoc &DL) { - // Note that masked-merge variants using XOR or ADD expressions are - // normalized to OR by InstCombine so we only check for OR. - assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); - - // If the target supports and-not, don't fold this. 
- if (TLI.hasAndNot(SDValue(Node, 0))) - return SDValue(); - - SDValue M, X, Y; - if (sd_match(Node, - m_Or(m_OneUse(m_And(m_OneUse(m_Not(m_Value(M))), m_Value(Y))), - m_OneUse(m_And(m_Deferred(M), m_Value(X)))))) { - EVT VT = M.getValueType(); - SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, Y); - SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor, M); - return DAG.getNode(ISD::XOR, DL, VT, And, Y); - } - return SDValue(); -} - SDValue DAGCombiner::visitOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -11252,19 +11260,13 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N, const SDLoc &DL) { if (N->getOpcode() == ISD::TRUNCATE) N = N->getOperand(0).getNode(); - if (N->getOpcode() != ISD::ABS) - return SDValue(); - EVT VT = N->getValueType(0); - SDValue AbsOp1 = N->getOperand(0); SDValue Op0, Op1; - if (AbsOp1.getOpcode() != ISD::SUB) + if (!sd_match(N, m_Abs(m_Sub(m_Value(Op0), m_Value(Op1))))) return SDValue(); - Op0 = AbsOp1.getOperand(0); - Op1 = AbsOp1.getOperand(1); - + SDValue AbsOp0 = N->getOperand(0); unsigned Opc0 = Op0.getOpcode(); // Check if the operands of the sub are (zero|sign)-extended. @@ -11274,7 +11276,7 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N, const SDLoc &DL) { Opc0 != ISD::SIGN_EXTEND_INREG)) { // fold (abs (sub nsw x, y)) -> abds(x, y) // Don't fold this for unsupported types as we lose the NSW handling. 
- if (AbsOp1->getFlags().hasNoSignedWrap() && hasOperation(ISD::ABDS, VT) && + if (AbsOp0->getFlags().hasNoSignedWrap() && hasOperation(ISD::ABDS, VT) && TLI.preferABDSToABSWithNSW(VT)) { SDValue ABD = DAG.getNode(ISD::ABDS, DL, VT, Op0, Op1); return DAG.getZExtOrTrunc(ABD, DL, SrcVT); @@ -11652,6 +11654,22 @@ SDValue DAGCombiner::foldShiftToAvg(SDNode *N) { return DAG.getNode(FloorISD, SDLoc(N), N->getValueType(0), {A, B}); } +SDValue DAGCombiner::foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT) { + unsigned Opc = N->getOpcode(); + SDValue X, Y, Z; + if (sd_match( + N, m_BitwiseLogic(m_Value(X), m_Add(m_Not(m_Value(Y)), m_Value(Z))))) + return DAG.getNode(Opc, DL, VT, X, + DAG.getNOT(DL, DAG.getNode(ISD::SUB, DL, VT, Y, Z), VT)); + + if (sd_match(N, m_BitwiseLogic(m_Value(X), m_Sub(m_OneUse(m_Not(m_Value(Y))), + m_Value(Z))))) + return DAG.getNode(Opc, DL, VT, X, + DAG.getNOT(DL, DAG.getNode(ISD::ADD, DL, VT, Y, Z), VT)); + + return SDValue(); +} + /// Generate Min/Max node SDValue DAGCombiner::combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 45a37622a531b..b0e3f534e2aaa 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7377,6 +7377,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && VT.getScalarType() == MVT::i1) return getNode(ISD::XOR, DL, VT, N1, N2); + // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)). 
+ if (Opcode == ISD::ADD && N1.getOpcode() == ISD::VSCALE && + N2.getOpcode() == ISD::VSCALE) { + const APInt &C1 = N1->getConstantOperandAPInt(0); + const APInt &C2 = N2->getConstantOperandAPInt(0); + return getVScale(DL, VT, C1 + C2); + } break; case ISD::MUL: assert(VT.isInteger() && "This operator does not apply to FP types!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index c63eb7fc6b374..ec0c5473b0db0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1320,10 +1320,7 @@ void SelectionDAGBuilder::visit(const Instruction &I) { HandlePHINodesInSuccessorBlocks(I.getParent()); } - // Increase the SDNodeOrder if dealing with a non-debug instruction. - if (!isa(I)) - ++SDNodeOrder; - + ++SDNodeOrder; CurInst = &I; // Set inserted listener only if required. @@ -1791,8 +1788,26 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (const Constant *C = dyn_cast(V)) { EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true); - if (const ConstantInt *CI = dyn_cast(C)) - return DAG.getConstant(*CI, getCurSDLoc(), VT); + if (const ConstantInt *CI = dyn_cast(C)) { + SDLoc DL = getCurSDLoc(); + + // DAG.getConstant() may attempt to legalise the vector constant which can + // significantly change the combines applied to the DAG. To reduce the + // divergence when enabling ConstantInt based vectors we try to construct + // the DAG in the same way as shufflevector based splats. TODO: The + // divergence sometimes leads to better optimisations. Ideally we should + // prevent DAG.getConstant() from legalising too early but there are some + // degradations preventing this. 
+ if (VT.isScalableVector()) + return DAG.getNode( + ISD::SPLAT_VECTOR, DL, VT, + DAG.getConstant(CI->getValue(), DL, VT.getVectorElementType())); + if (VT.isFixedLengthVector()) + return DAG.getSplatBuildVector( + VT, DL, + DAG.getConstant(CI->getValue(), DL, VT.getVectorElementType())); + return DAG.getConstant(*CI, DL, VT); + } if (const GlobalValue *GV = dyn_cast(C)) return DAG.getGlobalAddress(GV, getCurSDLoc(), VT); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index b02a03c0b0cb2..ac6d25f141ec6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1507,7 +1507,6 @@ static bool isFoldedOrDeadInstruction(const Instruction *I, const FunctionLoweringInfo &FuncInfo) { return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded. !I->isTerminator() && // Terminators aren't folded. - !isa(I) && // Debug instructions aren't folded. !I->isEHPad() && // EH pad instructions aren't folded. !FuncInfo.isExportedInst(I); // Exported instrs must be computed. 
} diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a0ffb4b6d5a4c..52f19cc6e1ab0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/CodeGenCommonISel.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -419,7 +420,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, NewLHS = Call.first; NewRHS = DAG.getConstant(0, dl, RetVT); - CCCode = getCmpLibcallCC(LC1); + CCCode = getICmpCondCode(getSoftFloatCmpLibcallPredicate(LC1)); if (ShouldInvertCC) { assert(RetVT.isInteger()); CCCode = getSetCCInverse(CCCode, RetVT); @@ -441,7 +442,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue Tmp = DAG.getSetCC(dl, SetCCVT, NewLHS, NewRHS, CCCode); auto Call2 = makeLibCall(DAG, LC2, RetVT, Ops, CallOptions, dl, Chain); - CCCode = getCmpLibcallCC(LC2); + CCCode = getICmpCondCode(getSoftFloatCmpLibcallPredicate(LC2)); if (ShouldInvertCC) CCCode = getSetCCInverse(CCCode, RetVT); NewLHS = DAG.getSetCC(dl, SetCCVT, Call2.first, NewRHS, CCCode); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 935afaf9dd550..b1afdc2a3ac39 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1030,7 +1030,7 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const { // If type is to be expanded, split the vector. 
// <4 x i140> -> <2 x i140> if (LK.first == TypeExpandInteger) { - if (VT.getVectorElementCount().isScalable()) + if (NumElts.isScalable() && NumElts.getKnownMinValue() == 1) return LegalizeKind(TypeScalarizeScalableVector, EltVT); return LegalizeKind(TypeSplitVector, VT.getHalfNumVectorElementsVT(Context)); diff --git a/llvm/lib/DebugInfo/DWARF/CMakeLists.txt b/llvm/lib/DebugInfo/DWARF/CMakeLists.txt index cc9734f9f22be..86e74110b15ea 100644 --- a/llvm/lib/DebugInfo/DWARF/CMakeLists.txt +++ b/llvm/lib/DebugInfo/DWARF/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_component_library(LLVMDebugInfoDWARF DWARFAbbreviationDeclaration.cpp DWARFAddressRange.cpp DWARFAcceleratorTable.cpp + DWARFCFIPrinter.cpp DWARFCFIProgram.cpp DWARFCompileUnit.cpp DWARFContext.cpp diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp new file mode 100644 index 0000000000000..e52f671e4fa1c --- /dev/null +++ b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp @@ -0,0 +1,121 @@ +//===- DWARFCFIPrinter.cpp - Print the cfi-portions of .debug_frame -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h" +#include "llvm/DebugInfo/DIContext.h" +#include "llvm/DebugInfo/DWARF/DWARFCFIProgram.h" +#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/DataExtractor.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include + +using namespace llvm; +using namespace dwarf; + +static void printRegister(raw_ostream &OS, const DIDumpOptions &DumpOpts, + unsigned RegNum) { + if (DumpOpts.GetNameForDWARFReg) { + auto RegName = DumpOpts.GetNameForDWARFReg(RegNum, DumpOpts.IsEH); + if (!RegName.empty()) { + OS << RegName; + return; + } + } + OS << "reg" << RegNum; +} + +/// Print \p Opcode's operand number \p OperandIdx which has value \p Operand. +static void printOperand(raw_ostream &OS, const DIDumpOptions &DumpOpts, + const CFIProgram &P, + const CFIProgram::Instruction &Instr, + unsigned OperandIdx, uint64_t Operand, + std::optional &Address) { + assert(OperandIdx < CFIProgram::MaxOperands); + uint8_t Opcode = Instr.Opcode; + CFIProgram::OperandType Type = P.getOperandTypes()[Opcode][OperandIdx]; + + switch (Type) { + case CFIProgram::OT_Unset: { + OS << " Unsupported " << (OperandIdx ? "second" : "first") << " operand to"; + auto OpcodeName = P.callFrameString(Opcode); + if (!OpcodeName.empty()) + OS << " " << OpcodeName; + else + OS << format(" Opcode %x", Opcode); + break; + } + case CFIProgram::OT_None: + break; + case CFIProgram::OT_Address: + OS << format(" %" PRIx64, Operand); + Address = Operand; + break; + case CFIProgram::OT_Offset: + // The offsets are all encoded in a unsigned form, but in practice + // consumers use them signed. 
It's most certainly legacy due to + // the lack of signed variants in the first Dwarf standards. + OS << format(" %+" PRId64, int64_t(Operand)); + break; + case CFIProgram::OT_FactoredCodeOffset: // Always Unsigned + if (P.codeAlign()) + OS << format(" %" PRId64, Operand * P.codeAlign()); + else + OS << format(" %" PRId64 "*code_alignment_factor", Operand); + if (Address && P.codeAlign()) { + *Address += Operand * P.codeAlign(); + OS << format(" to 0x%" PRIx64, *Address); + } + break; + case CFIProgram::OT_SignedFactDataOffset: + if (P.dataAlign()) + OS << format(" %" PRId64, int64_t(Operand) * P.dataAlign()); + else + OS << format(" %" PRId64 "*data_alignment_factor", int64_t(Operand)); + break; + case CFIProgram::OT_UnsignedFactDataOffset: + if (P.dataAlign()) + OS << format(" %" PRId64, Operand * P.dataAlign()); + else + OS << format(" %" PRId64 "*data_alignment_factor", Operand); + break; + case CFIProgram::OT_Register: + OS << ' '; + printRegister(OS, DumpOpts, Operand); + break; + case CFIProgram::OT_AddressSpace: + OS << format(" in addrspace%" PRId64, Operand); + break; + case CFIProgram::OT_Expression: + assert(Instr.Expression && "missing DWARFExpression object"); + OS << " "; + DWARFExpressionPrinter::print(&Instr.Expression.value(), OS, DumpOpts, + nullptr); + break; + } +} + +void llvm::dwarf::printCFIProgram(const CFIProgram &P, raw_ostream &OS, + const DIDumpOptions &DumpOpts, + unsigned IndentLevel, + std::optional Address) { + for (const auto &Instr : P) { + uint8_t Opcode = Instr.Opcode; + OS.indent(2 * IndentLevel); + OS << P.callFrameString(Opcode) << ":"; + for (size_t i = 0; i < Instr.Ops.size(); ++i) + printOperand(OS, DumpOpts, P, Instr, i, Instr.Ops[i], Address); + OS << '\n'; + } +} diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp index 8d25599627c4a..365b26b98a1e3 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp @@ -23,18 
+23,6 @@ using namespace llvm; using namespace dwarf; -static void printRegister(raw_ostream &OS, DIDumpOptions DumpOpts, - unsigned RegNum) { - if (DumpOpts.GetNameForDWARFReg) { - auto RegName = DumpOpts.GetNameForDWARFReg(RegNum, DumpOpts.IsEH); - if (!RegName.empty()) { - OS << RegName; - return; - } - } - OS << "reg" << RegNum; -} - // See DWARF standard v3, section 7.23 const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; @@ -361,85 +349,3 @@ CFIProgram::getOperandTypes() { return ArrayRef(&OpTypes[0], DW_CFA_restore + 1); } - -/// Print \p Opcode's operand number \p OperandIdx which has value \p Operand. -void CFIProgram::printOperand(raw_ostream &OS, DIDumpOptions DumpOpts, - const Instruction &Instr, unsigned OperandIdx, - uint64_t Operand, - std::optional &Address) const { - assert(OperandIdx < MaxOperands); - uint8_t Opcode = Instr.Opcode; - OperandType Type = getOperandTypes()[Opcode][OperandIdx]; - - switch (Type) { - case OT_Unset: { - OS << " Unsupported " << (OperandIdx ? "second" : "first") << " operand to"; - auto OpcodeName = callFrameString(Opcode); - if (!OpcodeName.empty()) - OS << " " << OpcodeName; - else - OS << format(" Opcode %x", Opcode); - break; - } - case OT_None: - break; - case OT_Address: - OS << format(" %" PRIx64, Operand); - Address = Operand; - break; - case OT_Offset: - // The offsets are all encoded in a unsigned form, but in practice - // consumers use them signed. It's most certainly legacy due to - // the lack of signed variants in the first Dwarf standards. 
- OS << format(" %+" PRId64, int64_t(Operand)); - break; - case OT_FactoredCodeOffset: // Always Unsigned - if (CodeAlignmentFactor) - OS << format(" %" PRId64, Operand * CodeAlignmentFactor); - else - OS << format(" %" PRId64 "*code_alignment_factor", Operand); - if (Address && CodeAlignmentFactor) { - *Address += Operand * CodeAlignmentFactor; - OS << format(" to 0x%" PRIx64, *Address); - } - break; - case OT_SignedFactDataOffset: - if (DataAlignmentFactor) - OS << format(" %" PRId64, int64_t(Operand) * DataAlignmentFactor); - else - OS << format(" %" PRId64 "*data_alignment_factor", int64_t(Operand)); - break; - case OT_UnsignedFactDataOffset: - if (DataAlignmentFactor) - OS << format(" %" PRId64, Operand * DataAlignmentFactor); - else - OS << format(" %" PRId64 "*data_alignment_factor", Operand); - break; - case OT_Register: - OS << ' '; - printRegister(OS, DumpOpts, Operand); - break; - case OT_AddressSpace: - OS << format(" in addrspace%" PRId64, Operand); - break; - case OT_Expression: - assert(Instr.Expression && "missing DWARFExpression object"); - OS << " "; - DWARFExpressionPrinter::print(&Instr.Expression.value(), OS, DumpOpts, - nullptr); - break; - } -} - -void CFIProgram::dump(raw_ostream &OS, DIDumpOptions DumpOpts, - unsigned IndentLevel, - std::optional Address) const { - for (const auto &Instr : Instructions) { - uint8_t Opcode = Instr.Opcode; - OS.indent(2 * IndentLevel); - OS << callFrameString(Opcode) << ":"; - for (unsigned i = 0; i < Instr.Ops.size(); ++i) - printOperand(OS, DumpOpts, Instr, i, Instr.Ops[i], Address); - OS << '\n'; - } -} diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp index c46b14b4446f7..9dff925073dbe 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DIContext.h" +#include 
"llvm/DebugInfo/DWARF/DWARFCFIPrinter.h" #include "llvm/DebugInfo/DWARF/DWARFCFIProgram.h" #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFExpression.h" @@ -602,7 +603,8 @@ void CIE::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const { OS << "\n"; } OS << "\n"; - CFIs.dump(OS, DumpOpts, /*IndentLevel=*/1, /*InitialLocation=*/{}); + printCFIProgram(CFIs, OS, DumpOpts, /*IndentLevel=*/1, + /*InitialLocation=*/{}); OS << "\n"; if (Expected RowsOrErr = UnwindTable::create(this)) @@ -630,7 +632,7 @@ void FDE::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const { OS << " Format: " << FormatString(IsDWARF64) << "\n"; if (LSDAAddress) OS << format(" LSDA Address: %016" PRIx64 "\n", *LSDAAddress); - CFIs.dump(OS, DumpOpts, /*IndentLevel=*/1, InitialLocation); + printCFIProgram(CFIs, OS, DumpOpts, /*IndentLevel=*/1, InitialLocation); OS << "\n"; if (Expected RowsOrErr = UnwindTable::create(this)) diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp index 765a3bcbed7e2..a1ddb318055be 100644 --- a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp @@ -15,111 +15,46 @@ #include "llvm/ADT/bit.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Metadata.h" +#include "llvm/Support/ScopedPrinter.h" namespace llvm { namespace hlsl { namespace rootsig { -static raw_ostream &operator<<(raw_ostream &OS, const Register &Reg) { - switch (Reg.ViewType) { - case RegisterType::BReg: - OS << "b"; - break; - case RegisterType::TReg: - OS << "t"; - break; - case RegisterType::UReg: - OS << "u"; - break; - case RegisterType::SReg: - OS << "s"; - break; - } - OS << Reg.Number; - return OS; -} - -static raw_ostream &operator<<(raw_ostream &OS, - const ShaderVisibility &Visibility) { - switch (Visibility) { - case ShaderVisibility::All: - OS << "All"; - break; - case ShaderVisibility::Vertex: - OS << "Vertex"; - break; - case 
ShaderVisibility::Hull: - OS << "Hull"; - break; - case ShaderVisibility::Domain: - OS << "Domain"; - break; - case ShaderVisibility::Geometry: - OS << "Geometry"; - break; - case ShaderVisibility::Pixel: - OS << "Pixel"; - break; - case ShaderVisibility::Amplification: - OS << "Amplification"; - break; - case ShaderVisibility::Mesh: - OS << "Mesh"; - break; - } - - return OS; +template +static std::optional getEnumName(const T Value, + ArrayRef> Enums) { + for (const auto &EnumItem : Enums) + if (EnumItem.Value == Value) + return EnumItem.Name; + return std::nullopt; } -static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) { - switch (Type) { - case ClauseType::CBuffer: - OS << "CBV"; - break; - case ClauseType::SRV: - OS << "SRV"; - break; - case ClauseType::UAV: - OS << "UAV"; - break; - case ClauseType::Sampler: - OS << "Sampler"; - break; - } - +template +static raw_ostream &printEnum(raw_ostream &OS, const T Value, + ArrayRef> Enums) { + auto MaybeName = getEnumName(Value, Enums); + if (MaybeName) + OS << *MaybeName; return OS; } -static raw_ostream &operator<<(raw_ostream &OS, - const DescriptorRangeFlags &Flags) { +template +static raw_ostream &printFlags(raw_ostream &OS, const T Value, + ArrayRef> Flags) { bool FlagSet = false; - unsigned Remaining = llvm::to_underlying(Flags); + unsigned Remaining = llvm::to_underlying(Value); while (Remaining) { unsigned Bit = 1u << llvm::countr_zero(Remaining); if (Remaining & Bit) { if (FlagSet) OS << " | "; - switch (static_cast(Bit)) { - case DescriptorRangeFlags::DescriptorsVolatile: - OS << "DescriptorsVolatile"; - break; - case DescriptorRangeFlags::DataVolatile: - OS << "DataVolatile"; - break; - case DescriptorRangeFlags::DataStaticWhileSetAtExecute: - OS << "DataStaticWhileSetAtExecute"; - break; - case DescriptorRangeFlags::DataStatic: - OS << "DataStatic"; - break; - case DescriptorRangeFlags::DescriptorsStaticKeepingBufferBoundsChecks: - OS << "DescriptorsStaticKeepingBufferBoundsChecks"; - 
break; - default: + auto MaybeFlag = getEnumName(T(Bit), Flags); + if (MaybeFlag) + OS << *MaybeFlag; + else OS << "invalid: " << Bit; - break; - } FlagSet = true; } @@ -128,6 +63,218 @@ static raw_ostream &operator<<(raw_ostream &OS, if (!FlagSet) OS << "None"; + return OS; +} + +static const EnumEntry RegisterNames[] = { + {"b", RegisterType::BReg}, + {"t", RegisterType::TReg}, + {"u", RegisterType::UReg}, + {"s", RegisterType::SReg}, +}; + +static raw_ostream &operator<<(raw_ostream &OS, const Register &Reg) { + printEnum(OS, Reg.ViewType, ArrayRef(RegisterNames)); + OS << Reg.Number; + + return OS; +} + +static const EnumEntry VisibilityNames[] = { + {"All", ShaderVisibility::All}, + {"Vertex", ShaderVisibility::Vertex}, + {"Hull", ShaderVisibility::Hull}, + {"Domain", ShaderVisibility::Domain}, + {"Geometry", ShaderVisibility::Geometry}, + {"Pixel", ShaderVisibility::Pixel}, + {"Amplification", ShaderVisibility::Amplification}, + {"Mesh", ShaderVisibility::Mesh}, +}; + +static raw_ostream &operator<<(raw_ostream &OS, + const ShaderVisibility &Visibility) { + printEnum(OS, Visibility, ArrayRef(VisibilityNames)); + + return OS; +} + +static const EnumEntry SamplerFilterNames[] = { + {"MinMagMipPoint", SamplerFilter::MinMagMipPoint}, + {"MinMagPointMipLinear", SamplerFilter::MinMagPointMipLinear}, + {"MinPointMagLinearMipPoint", SamplerFilter::MinPointMagLinearMipPoint}, + {"MinPointMagMipLinear", SamplerFilter::MinPointMagMipLinear}, + {"MinLinearMagMipPoint", SamplerFilter::MinLinearMagMipPoint}, + {"MinLinearMagPointMipLinear", SamplerFilter::MinLinearMagPointMipLinear}, + {"MinMagLinearMipPoint", SamplerFilter::MinMagLinearMipPoint}, + {"MinMagMipLinear", SamplerFilter::MinMagMipLinear}, + {"Anisotropic", SamplerFilter::Anisotropic}, + {"ComparisonMinMagMipPoint", SamplerFilter::ComparisonMinMagMipPoint}, + {"ComparisonMinMagPointMipLinear", + SamplerFilter::ComparisonMinMagPointMipLinear}, + {"ComparisonMinPointMagLinearMipPoint", + 
SamplerFilter::ComparisonMinPointMagLinearMipPoint}, + {"ComparisonMinPointMagMipLinear", + SamplerFilter::ComparisonMinPointMagMipLinear}, + {"ComparisonMinLinearMagMipPoint", + SamplerFilter::ComparisonMinLinearMagMipPoint}, + {"ComparisonMinLinearMagPointMipLinear", + SamplerFilter::ComparisonMinLinearMagPointMipLinear}, + {"ComparisonMinMagLinearMipPoint", + SamplerFilter::ComparisonMinMagLinearMipPoint}, + {"ComparisonMinMagMipLinear", SamplerFilter::ComparisonMinMagMipLinear}, + {"ComparisonAnisotropic", SamplerFilter::ComparisonAnisotropic}, + {"MinimumMinMagMipPoint", SamplerFilter::MinimumMinMagMipPoint}, + {"MinimumMinMagPointMipLinear", SamplerFilter::MinimumMinMagPointMipLinear}, + {"MinimumMinPointMagLinearMipPoint", + SamplerFilter::MinimumMinPointMagLinearMipPoint}, + {"MinimumMinPointMagMipLinear", SamplerFilter::MinimumMinPointMagMipLinear}, + {"MinimumMinLinearMagMipPoint", SamplerFilter::MinimumMinLinearMagMipPoint}, + {"MinimumMinLinearMagPointMipLinear", + SamplerFilter::MinimumMinLinearMagPointMipLinear}, + {"MinimumMinMagLinearMipPoint", SamplerFilter::MinimumMinMagLinearMipPoint}, + {"MinimumMinMagMipLinear", SamplerFilter::MinimumMinMagMipLinear}, + {"MinimumAnisotropic", SamplerFilter::MinimumAnisotropic}, + {"MaximumMinMagMipPoint", SamplerFilter::MaximumMinMagMipPoint}, + {"MaximumMinMagPointMipLinear", SamplerFilter::MaximumMinMagPointMipLinear}, + {"MaximumMinPointMagLinearMipPoint", + SamplerFilter::MaximumMinPointMagLinearMipPoint}, + {"MaximumMinPointMagMipLinear", SamplerFilter::MaximumMinPointMagMipLinear}, + {"MaximumMinLinearMagMipPoint", SamplerFilter::MaximumMinLinearMagMipPoint}, + {"MaximumMinLinearMagPointMipLinear", + SamplerFilter::MaximumMinLinearMagPointMipLinear}, + {"MaximumMinMagLinearMipPoint", SamplerFilter::MaximumMinMagLinearMipPoint}, + {"MaximumMinMagMipLinear", SamplerFilter::MaximumMinMagMipLinear}, + {"MaximumAnisotropic", SamplerFilter::MaximumAnisotropic}, +}; + +static raw_ostream &operator<<(raw_ostream 
&OS, const SamplerFilter &Filter) { + printEnum(OS, Filter, ArrayRef(SamplerFilterNames)); + + return OS; +} + +static const EnumEntry TextureAddressModeNames[] = { + {"Wrap", TextureAddressMode::Wrap}, + {"Mirror", TextureAddressMode::Mirror}, + {"Clamp", TextureAddressMode::Clamp}, + {"Border", TextureAddressMode::Border}, + {"MirrorOnce", TextureAddressMode::MirrorOnce}, +}; + +static raw_ostream &operator<<(raw_ostream &OS, + const TextureAddressMode &Address) { + printEnum(OS, Address, ArrayRef(TextureAddressModeNames)); + + return OS; +} + +static const EnumEntry ComparisonFuncNames[] = { + {"Never", ComparisonFunc::Never}, + {"Less", ComparisonFunc::Less}, + {"Equal", ComparisonFunc::Equal}, + {"LessEqual", ComparisonFunc::LessEqual}, + {"Greater", ComparisonFunc::Greater}, + {"NotEqual", ComparisonFunc::NotEqual}, + {"GreaterEqual", ComparisonFunc::GreaterEqual}, + {"Always", ComparisonFunc::Always}, +}; + +static raw_ostream &operator<<(raw_ostream &OS, + const ComparisonFunc &CompFunc) { + printEnum(OS, CompFunc, ArrayRef(ComparisonFuncNames)); + + return OS; +} + +static const EnumEntry StaticBorderColorNames[] = { + {"TransparentBlack", StaticBorderColor::TransparentBlack}, + {"OpaqueBlack", StaticBorderColor::OpaqueBlack}, + {"OpaqueWhite", StaticBorderColor::OpaqueWhite}, + {"OpaqueBlackUint", StaticBorderColor::OpaqueBlackUint}, + {"OpaqueWhiteUint", StaticBorderColor::OpaqueWhiteUint}, +}; + +static raw_ostream &operator<<(raw_ostream &OS, + const StaticBorderColor &BorderColor) { + printEnum(OS, BorderColor, ArrayRef(StaticBorderColorNames)); + + return OS; +} + +static const EnumEntry ResourceClassNames[] = { + {"CBV", dxil::ResourceClass::CBuffer}, + {"SRV", dxil::ResourceClass::SRV}, + {"UAV", dxil::ResourceClass::UAV}, + {"Sampler", dxil::ResourceClass::Sampler}, +}; + +static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) { + printEnum(OS, dxil::ResourceClass(llvm::to_underlying(Type)), + ArrayRef(ResourceClassNames)); + + 
return OS; +} + +static const EnumEntry RootDescriptorFlagNames[] = { + {"DataVolatile", RootDescriptorFlags::DataVolatile}, + {"DataStaticWhileSetAtExecute", + RootDescriptorFlags::DataStaticWhileSetAtExecute}, + {"DataStatic", RootDescriptorFlags::DataStatic}, +}; + +static raw_ostream &operator<<(raw_ostream &OS, + const RootDescriptorFlags &Flags) { + printFlags(OS, Flags, ArrayRef(RootDescriptorFlagNames)); + + return OS; +} + +static const EnumEntry DescriptorRangeFlagNames[] = { + {"DescriptorsVolatile", DescriptorRangeFlags::DescriptorsVolatile}, + {"DataVolatile", DescriptorRangeFlags::DataVolatile}, + {"DataStaticWhileSetAtExecute", + DescriptorRangeFlags::DataStaticWhileSetAtExecute}, + {"DataStatic", DescriptorRangeFlags::DataStatic}, + {"DescriptorsStaticKeepingBufferBoundsChecks", + DescriptorRangeFlags::DescriptorsStaticKeepingBufferBoundsChecks}, +}; + +static raw_ostream &operator<<(raw_ostream &OS, + const DescriptorRangeFlags &Flags) { + printFlags(OS, Flags, ArrayRef(DescriptorRangeFlagNames)); + + return OS; +} + +static const EnumEntry RootFlagNames[] = { + {"AllowInputAssemblerInputLayout", + RootFlags::AllowInputAssemblerInputLayout}, + {"DenyVertexShaderRootAccess", RootFlags::DenyVertexShaderRootAccess}, + {"DenyHullShaderRootAccess", RootFlags::DenyHullShaderRootAccess}, + {"DenyDomainShaderRootAccess", RootFlags::DenyDomainShaderRootAccess}, + {"DenyGeometryShaderRootAccess", RootFlags::DenyGeometryShaderRootAccess}, + {"DenyPixelShaderRootAccess", RootFlags::DenyPixelShaderRootAccess}, + {"AllowStreamOutput", RootFlags::AllowStreamOutput}, + {"LocalRootSignature", RootFlags::LocalRootSignature}, + {"DenyAmplificationShaderRootAccess", + RootFlags::DenyAmplificationShaderRootAccess}, + {"DenyMeshShaderRootAccess", RootFlags::DenyMeshShaderRootAccess}, + {"CBVSRVUAVHeapDirectlyIndexed", RootFlags::CBVSRVUAVHeapDirectlyIndexed}, + {"SamplerHeapDirectlyIndexed", RootFlags::SamplerHeapDirectlyIndexed}, +}; + +raw_ostream 
&operator<<(raw_ostream &OS, const RootFlags &Flags) { + OS << "RootFlags("; + printFlags(OS, Flags, ArrayRef(RootFlagNames)); + OS << ")"; + + return OS; +} + +raw_ostream &operator<<(raw_ostream &OS, const RootConstants &Constants) { + OS << "RootConstants(num32BitConstants = " << Constants.Num32BitConstants + << ", " << Constants.Reg << ", space = " << Constants.Space + << ", visibility = " << Constants.Visibility << ")"; return OS; } @@ -152,6 +299,31 @@ raw_ostream &operator<<(raw_ostream &OS, const DescriptorTableClause &Clause) { return OS; } +raw_ostream &operator<<(raw_ostream &OS, const RootDescriptor &Descriptor) { + ClauseType Type = ClauseType(llvm::to_underlying(Descriptor.Type)); + OS << "Root" << Type << "(" << Descriptor.Reg + << ", space = " << Descriptor.Space + << ", visibility = " << Descriptor.Visibility + << ", flags = " << Descriptor.Flags << ")"; + + return OS; +} + +raw_ostream &operator<<(raw_ostream &OS, const StaticSampler &Sampler) { + OS << "StaticSampler(" << Sampler.Reg << ", filter = " << Sampler.Filter + << ", addressU = " << Sampler.AddressU + << ", addressV = " << Sampler.AddressV + << ", addressW = " << Sampler.AddressW + << ", mipLODBias = " << Sampler.MipLODBias + << ", maxAnisotropy = " << Sampler.MaxAnisotropy + << ", comparisonFunc = " << Sampler.CompFunc + << ", borderColor = " << Sampler.BorderColor + << ", minLOD = " << Sampler.MinLOD << ", maxLOD = " << Sampler.MaxLOD + << ", space = " << Sampler.Space << ", visibility = " << Sampler.Visibility + << ")"; + return OS; +} + void dumpRootElements(raw_ostream &OS, ArrayRef Elements) { OS << "RootElements{"; bool First = true; @@ -236,12 +408,13 @@ MDNode *MetadataBuilder::BuildRootConstants(const RootConstants &Constants) { MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) { IRBuilder<> Builder(Ctx); - llvm::SmallString<7> Name; - llvm::raw_svector_ostream OS(Name); - OS << "Root" << ClauseType(llvm::to_underlying(Descriptor.Type)); - + 
std::optional TypeName = + getEnumName(dxil::ResourceClass(llvm::to_underlying(Descriptor.Type)), + ArrayRef(ResourceClassNames)); + assert(TypeName && "Provided an invalid Resource Class"); + llvm::SmallString<7> Name({"Root", *TypeName}); Metadata *Operands[] = { - MDString::get(Ctx, OS.str()), + MDString::get(Ctx, Name), ConstantAsMetadata::get( Builder.getInt32(llvm::to_underlying(Descriptor.Visibility))), ConstantAsMetadata::get(Builder.getInt32(Descriptor.Reg.Number)), @@ -277,19 +450,20 @@ MDNode *MetadataBuilder::BuildDescriptorTable(const DescriptorTable &Table) { MDNode *MetadataBuilder::BuildDescriptorTableClause( const DescriptorTableClause &Clause) { IRBuilder<> Builder(Ctx); - std::string Name; - llvm::raw_string_ostream OS(Name); - OS << Clause.Type; - return MDNode::get( - Ctx, { - MDString::get(Ctx, OS.str()), - ConstantAsMetadata::get(Builder.getInt32(Clause.NumDescriptors)), - ConstantAsMetadata::get(Builder.getInt32(Clause.Reg.Number)), - ConstantAsMetadata::get(Builder.getInt32(Clause.Space)), - ConstantAsMetadata::get(Builder.getInt32(Clause.Offset)), - ConstantAsMetadata::get( - Builder.getInt32(llvm::to_underlying(Clause.Flags))), - }); + std::optional Name = + getEnumName(dxil::ResourceClass(llvm::to_underlying(Clause.Type)), + ArrayRef(ResourceClassNames)); + assert(Name && "Provided an invalid Resource Class"); + Metadata *Operands[] = { + MDString::get(Ctx, *Name), + ConstantAsMetadata::get(Builder.getInt32(Clause.NumDescriptors)), + ConstantAsMetadata::get(Builder.getInt32(Clause.Reg.Number)), + ConstantAsMetadata::get(Builder.getInt32(Clause.Space)), + ConstantAsMetadata::get(Builder.getInt32(Clause.Offset)), + ConstantAsMetadata::get( + Builder.getInt32(llvm::to_underlying(Clause.Flags))), + }; + return MDNode::get(Ctx, Operands); } MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) { @@ -323,6 +497,67 @@ MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) { return MDNode::get(Ctx, 
Operands); } +std::optional +ResourceRange::getOverlapping(const RangeInfo &Info) const { + MapT::const_iterator Interval = Intervals.find(Info.LowerBound); + if (!Interval.valid() || Info.UpperBound < Interval.start()) + return std::nullopt; + return Interval.value(); +} + +const RangeInfo *ResourceRange::lookup(uint32_t X) const { + return Intervals.lookup(X, nullptr); +} + +std::optional ResourceRange::insert(const RangeInfo &Info) { + uint32_t LowerBound = Info.LowerBound; + uint32_t UpperBound = Info.UpperBound; + + std::optional Res = std::nullopt; + MapT::iterator Interval = Intervals.begin(); + + while (true) { + if (UpperBound < LowerBound) + break; + + Interval.advanceTo(LowerBound); + if (!Interval.valid()) // No interval found + break; + + // Let Interval = [x;y] and [LowerBound;UpperBound] = [a;b] and note that + // a <= y implicitly from Intervals.find(LowerBound) + if (UpperBound < Interval.start()) + break; // found interval does not overlap with inserted one + + if (!Res.has_value()) // Update to be the first found intersection + Res = Interval.value(); + + if (Interval.start() <= LowerBound && UpperBound <= Interval.stop()) { + // x <= a <= b <= y implies that [a;b] is covered by [x;y] + // -> so we don't need to insert this, report an overlap + return Res; + } else if (LowerBound <= Interval.start() && + Interval.stop() <= UpperBound) { + // a <= x <= y <= b implies that [x;y] is covered by [a;b] + // -> so remove the existing interval that we will cover with the + // overwrite + Interval.erase(); + } else if (LowerBound < Interval.start() && UpperBound <= Interval.stop()) { + // a < x <= b <= y implies that [a; x] is not covered but [x;b] is + // -> so set b = x - 1 such that [a;x-1] is now the interval to insert + UpperBound = Interval.start() - 1; + } else if (Interval.start() <= LowerBound && Interval.stop() < UpperBound) { + // a < x <= b <= y implies that [y; b] is not covered but [a;y] is + // -> so set a = y + 1 such that [y+1;b] is now 
the interval to insert + LowerBound = Interval.stop() + 1; + } + } + + assert(LowerBound <= UpperBound && "Attempting to insert an empty interval"); + Intervals.insert(LowerBound, UpperBound, &Info); + return Res; +} + } // namespace rootsig } // namespace hlsl } // namespace llvm diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index ca3d8438654dc..7cbbbff511c88 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -4184,7 +4184,11 @@ Expected OpenMPIRBuilder::createCanonicalLoop( Value *IndVar = Builder.CreateAdd(Span, Start); return BodyGenCB(Builder.saveIP(), IndVar); }; - LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP(); + LocationDescription LoopLoc = + ComputeIP.isSet() + ? Loc + : LocationDescription(Builder.saveIP(), + Builder.getCurrentDebugLocation()); return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name); } @@ -6703,7 +6707,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( /*TargetTaskAllocaIP=*/{})); else cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP, - /*Dependencies=*/{}, Info.HasNoWait)); + /*Dependencies=*/{}, RTArgs, Info.HasNoWait)); } else { Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr( omp::OMPRTL___tgt_target_data_begin_mapper); @@ -7150,15 +7154,55 @@ static Expected createOutlinedFunction( ValueReplacementMap); return Func; } +/// Given a task descriptor, TaskWithPrivates, return the pointer to the block +/// of pointers containing shared data between the parent task and the created +/// task. 
+static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, + IRBuilderBase &Builder, + Value *TaskWithPrivates, + Type *TaskWithPrivatesTy) { + Type *TaskTy = OMPIRBuilder.Task; + LLVMContext &Ctx = Builder.getContext(); + Value *TaskT = + Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0); + Value *Shareds = TaskT; + // TaskWithPrivatesTy can be one of the following + // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t, + // %struct.privates } + // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy + // + // In the former case, that is when TaskWithPrivatesTy != TaskTy, + // its first member has to be the task descriptor. TaskTy is the type of the + // task descriptor. TaskT is the pointer to the task descriptor. Loading the + // first member of TaskT, gives us the pointer to shared data. + if (TaskWithPrivatesTy != TaskTy) + Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0); + return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds); +} /// Create an entry point for a target task with the following. /// It'll have the following signature /// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task) /// This function is called from emitTargetTask once the /// code to launch the target kernel has been outlined already. -static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, - IRBuilderBase &Builder, - CallInst *StaleCI) { +/// NumOffloadingArrays is the number of offloading arrays that we need to copy +/// into the task structure so that the deferred target task can access this +/// data even after the stack frame of the generating task has been rolled +/// back. Offloading arrays contain base pointers, pointers, sizes etc +/// of the data that the target kernel will access. These in effect are the +/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs. 
+static Function *emitTargetTaskProxyFunction( + OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, + StructType *PrivatesTy, StructType *TaskWithPrivatesTy, + const size_t NumOffloadingArrays, const int SharedArgsOperandNo) { + + // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr. + // This is because PrivatesTy is the type of the structure in which + // we pass the offloading arrays to the deferred target task. + assert((!NumOffloadingArrays || PrivatesTy) && + "PrivatesTy cannot be nullptr when there are offloadingArrays" + "to privatize"); + Module &M = OMPBuilder.M; // KernelLaunchFunction is the target launch function, i.e. // the function that sets up kernel arguments and calls @@ -7185,34 +7229,49 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, // call void @_QQmain..omp_par.1(i32 %global.tid.val6) OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(), StaleCI->getIterator()); + LLVMContext &Ctx = StaleCI->getParent()->getContext(); + Type *ThreadIDTy = Type::getInt32Ty(Ctx); Type *TaskPtrTy = OMPBuilder.TaskPtr; - Type *TaskTy = OMPBuilder.Task; + [[maybe_unused]] Type *TaskTy = OMPBuilder.Task; + auto ProxyFnTy = FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy}, /* isVarArg */ false); auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage, ".omp_target_task_proxy_func", Builder.GetInsertBlock()->getModule()); - ProxyFn->getArg(0)->setName("thread.id"); - ProxyFn->getArg(1)->setName("task"); + Value *ThreadId = ProxyFn->getArg(0); + Value *TaskWithPrivates = ProxyFn->getArg(1); + ThreadId->setName("thread.id"); + TaskWithPrivates->setName("task"); + bool HasShareds = SharedArgsOperandNo > 0; + bool HasOffloadingArrays = NumOffloadingArrays > 0; BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", ProxyFn); Builder.SetInsertPoint(EntryBB); - bool HasShareds = StaleCI->arg_size() > 1; - // TODO: This is a temporary assert to prove to 
ourselves that - // the outlined target launch function is always going to have - // atmost two arguments if there is any data shared between - // host and device. - assert((!HasShareds || (StaleCI->arg_size() == 2)) && - "StaleCI with shareds should have exactly two arguments."); + SmallVector KernelLaunchArgs; + KernelLaunchArgs.reserve(StaleCI->arg_size()); + KernelLaunchArgs.push_back(ThreadId); + + if (HasOffloadingArrays) { + assert(TaskTy != TaskWithPrivatesTy && + "If there are offloading arrays to pass to the target" + "TaskTy cannot be the same as TaskWithPrivatesTy"); + (void)TaskTy; + Value *Privates = + Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1); + for (unsigned int i = 0; i < NumOffloadingArrays; ++i) + KernelLaunchArgs.push_back( + Builder.CreateStructGEP(PrivatesTy, Privates, i)); + } - Value *ThreadId = ProxyFn->getArg(0); if (HasShareds) { - auto *ArgStructAlloca = dyn_cast(StaleCI->getArgOperand(1)); + auto *ArgStructAlloca = + dyn_cast(StaleCI->getArgOperand(SharedArgsOperandNo)); assert(ArgStructAlloca && "Unable to find the alloca instruction corresponding to arguments " "for extracted function"); @@ -7220,27 +7279,67 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, AllocaInst *NewArgStructAlloca = Builder.CreateAlloca(ArgStructType, nullptr, "structArg"); - Value *TaskT = ProxyFn->getArg(1); + Value *SharedsSize = Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); - Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0); - LoadInst *LoadShared = - Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds); + LoadInst *LoadShared = loadSharedDataFromTaskDescriptor( + OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy); Builder.CreateMemCpy( NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared, LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize); - - Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca}); - } else { - 
Builder.CreateCall(KernelLaunchFunction, {ThreadId}); + KernelLaunchArgs.push_back(NewArgStructAlloca); } - + Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs); Builder.CreateRetVoid(); return ProxyFn; } +static Type *getOffloadingArrayType(Value *V) { + if (auto *GEP = dyn_cast(V)) + return GEP->getSourceElementType(); + if (auto *Alloca = dyn_cast(V)) + return Alloca->getAllocatedType(); + + llvm_unreachable("Unhandled Instruction type"); + return nullptr; +} +// This function returns a struct that has at most two members. +// The first member is always %struct.kmp_task_ompbuilder_t, that is the task +// descriptor. The second member, if needed, is a struct containing arrays +// that need to be passed to the offloaded target kernel. For example, +// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to +// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64] +// respectively, then the types created by this function are +// +// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] } +// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t, +// %struct.privates } +// %struct.task_with_privates is returned by this function. +// If there aren't any offloading arrays to pass to the target kernel, +// %struct.kmp_task_ompbuilder_t is returned. +static StructType * +createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, + ArrayRef OffloadingArraysToPrivatize) { + + if (OffloadingArraysToPrivatize.empty()) + return OMPIRBuilder.Task; + + SmallVector StructFieldTypes; + for (Value *V : OffloadingArraysToPrivatize) { + assert(V->getType()->isPointerTy() && + "Expected pointer to array to privatize. 
Got a non-pointer value " + "instead"); + Type *ArrayTy = getOffloadingArrayType(V); + assert(ArrayTy && "ArrayType cannot be nullptr"); + StructFieldTypes.push_back(ArrayTy); + } + StructType *PrivatesStructTy = + StructType::create(StructFieldTypes, "struct.privates"); + return StructType::create({OMPIRBuilder.Task, PrivatesStructTy}, + "struct.task_with_privates"); +} static Error emitTargetOutlinedFunction( OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, @@ -7266,7 +7365,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector &Dependencies, - bool HasNoWait) { + const TargetDataRTArgs &RTArgs, bool HasNoWait) { // The following explains the code-gen scenario for the `target` directive. A // similar scneario is followed for other device-related directives (e.g. @@ -7276,27 +7375,30 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // When we arrive at this function, the target region itself has been // outlined into the function OutlinedFn. // So at ths point, for - // -------------------------------------------------- + // -------------------------------------------------------------- // void user_code_that_offloads(...) { - // omp target depend(..) map(from:a) map(to:b, c) - // a = b + c + // omp target depend(..) map(from:a) map(to:b) private(i) + // do i = 1, 10 + // a(i) = b(i) + n // } // - // -------------------------------------------------- + // -------------------------------------------------------------- // // we have // - // -------------------------------------------------- + // -------------------------------------------------------------- // // void user_code_that_offloads(...) 
{ - // %.offload_baseptrs = alloca [3 x ptr], align 8 - // %.offload_ptrs = alloca [3 x ptr], align 8 - // %.offload_mappers = alloca [3 x ptr], align 8 + // %.offload_baseptrs = alloca [2 x ptr], align 8 + // %.offload_ptrs = alloca [2 x ptr], align 8 + // %.offload_mappers = alloca [2 x ptr], align 8 // ;; target region has been outlined and now we need to // ;; offload to it via a target task. // } - // void outlined_device_function(ptr a, ptr b, ptr c) { - // *a = *b + *c + // void outlined_device_function(ptr a, ptr b, ptr n) { + // n = *n_ptr; + // do i = 1, 10 + // a(i) = b(i) + n // } // // We have to now do the following @@ -7309,33 +7411,59 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // (iii) Create a task with the task entry point created in (ii) // // That is we create the following - // + // struct task_with_privates { + // struct kmp_task_ompbuilder_t task_struct; + // struct privates { + // [2 x ptr] ; baseptrs + // [2 x ptr] ; ptrs + // [2 x i64] ; sizes + // } + // } // void user_code_that_offloads(...) { - // %.offload_baseptrs = alloca [3 x ptr], align 8 - // %.offload_ptrs = alloca [3 x ptr], align 8 - // %.offload_mappers = alloca [3 x ptr], align 8 + // %.offload_baseptrs = alloca [2 x ptr], align 8 + // %.offload_ptrs = alloca [2 x ptr], align 8 + // %.offload_sizes = alloca [2 x i64], align 8 // // %structArg = alloca { ptr, ptr, ptr }, align 8 - // %strucArg[0] = %.offload_baseptrs - // %strucArg[1] = %.offload_ptrs - // %strucArg[2] = %.offload_mappers - // proxy_target_task = @__kmpc_omp_task_alloc(..., - // @.omp_target_task_proxy_func) - // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg)) + // %strucArg[0] = a + // %strucArg[1] = b + // %strucArg[2] = &n + // + // target_task_with_privates = @__kmpc_omp_target_task_alloc(..., + // sizeof(kmp_task_ompbuilder_t), + // sizeof(structArg), + // @.omp_target_task_proxy_func, + // ...) 
+ // memcpy(target_task_with_privates->task_struct->shareds, %structArg, + // sizeof(structArg)) + // memcpy(target_task_with_privates->privates->baseptrs, + // offload_baseptrs, sizeof(offload_baseptrs) + // memcpy(target_task_with_privates->privates->ptrs, + // offload_ptrs, sizeof(offload_ptrs) + // memcpy(target_task_with_privates->privates->sizes, + // offload_sizes, sizeof(offload_sizes) // dependencies_array = ... // ;; if nowait not present // call @__kmpc_omp_wait_deps(..., dependencies_array) // call @__kmpc_omp_task_begin_if0(...) // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr - // %proxy_target_task) call @__kmpc_omp_task_complete_if0(...) + // %target_task_with_privates) + // call @__kmpc_omp_task_complete_if0(...) // } // // define internal void @.omp_target_task_proxy_func(i32 %thread.id, // ptr %task) { // %structArg = alloca {ptr, ptr, ptr} - // %shared_data = load (getelementptr %task, 0, 0) - // mempcy(%structArg, %shared_data, sizeof(structArg)) - // kernel_launch_function(%thread.id, %structArg) + // %task_ptr = getelementptr(%task, 0, 0) + // %shared_data = load (getelementptr %task_ptr, 0, 0) + // mempcy(%structArg, %shared_data, sizeof(%structArg)) + // + // %offloading_arrays = getelementptr(%task, 0, 1) + // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0) + // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1) + // %offload_sizes = getelementptr(%offloading_arrays, 0, 2) + // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs, + // %offload_sizes, %structArg) // } // // We need the proxy function because the signature of the task entry point @@ -7343,21 +7471,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // that of the kernel_launch function. // // kernel_launch_function is generated by emitKernelLaunch and has the - // always_inline attribute. - // void kernel_launch_function(thread_id, - // structArg) alwaysinline { + // always_inline attribute. 
For this example, it'll look like so: + // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs, + // %offload_sizes, %structArg) alwaysinline { // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8 - // offload_baseptrs = load(getelementptr structArg, 0, 0) - // offload_ptrs = load(getelementptr structArg, 0, 1) - // offload_mappers = load(getelementptr structArg, 0, 2) + // ; load aggregated data from %structArg // ; setup kernel_args using offload_baseptrs, offload_ptrs and - // ; offload_mappers + // ; offload_sizes // call i32 @__tgt_target_kernel(..., // outlined_device_function, // ptr %kernel_args) // } - // void outlined_device_function(ptr a, ptr b, ptr c) { - // *a = *b + *c + // void outlined_device_function(ptr a, ptr b, ptr n) { + // n = *n_ptr; + // do i = 1, 10 + // a(i) = b(i) + n // } // BasicBlock *TargetTaskBodyBB = @@ -7378,6 +7506,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false)); + // Generate the task body which will subsequently be outlined. 
Builder.restoreIP(TargetTaskBodyIP); if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP)) return Err; @@ -7396,15 +7525,57 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(), /*IsFinished=*/true); - OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait, - DeviceID](Function &OutlinedFn) mutable { + SmallVector OffloadingArraysToPrivatize; + bool NeedsTargetTask = HasNoWait && DeviceID; + if (NeedsTargetTask) { + for (auto *V : + {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray, + RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd, + RTArgs.SizesArray}) { + if (V && !isa(V)) { + OffloadingArraysToPrivatize.push_back(V); + OI.ExcludeArgsFromAggregate.push_back(V); + } + } + } + OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask, + DeviceID, OffloadingArraysToPrivatize]( + Function &OutlinedFn) mutable { assert(OutlinedFn.hasOneUse() && "there must be a single user for the outlined function"); CallInst *StaleCI = cast(OutlinedFn.user_back()); - bool HasShareds = StaleCI->arg_size() > 1; - Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI); + // The first argument of StaleCI is always the thread id. + // The next few arguments are the pointers to offloading arrays + // if any. (see OffloadingArraysToPrivatize) + // Finally, all other local values that are live-in into the outlined region + // end up in a structure whose pointer is passed as the last argument. This + // piece of data is passed in the "shared" field of the task structure. So, + // we know we have to pass shareds to the task if the number of arguments is + // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the + // thread id. 
Further, for safety, we assert that the number of arguments of + // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2 + const unsigned int NumStaleCIArgs = StaleCI->arg_size(); + bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1; + assert( + !HasShareds || + NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2) && + "Wrong number of arguments for StaleCI when shareds are present"); + int SharedArgOperandNo = + HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0; + + StructType *TaskWithPrivatesTy = + createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize); + StructType *PrivatesTy = nullptr; + + if (!OffloadingArraysToPrivatize.empty()) + PrivatesTy = + static_cast(TaskWithPrivatesTy->getElementType(1)); + + Function *ProxyFn = emitTargetTaskProxyFunction( + *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy, + OffloadingArraysToPrivatize.size(), SharedArgOperandNo); LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn << "\n"); @@ -7422,7 +7593,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide // the DeviceID to the deferred task and also since // @__kmpc_omp_target_task_alloc creates an untied/async task. - bool NeedsTargetTask = HasNoWait && DeviceID; Function *TaskAllocFn = !NeedsTargetTask ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc) @@ -7435,17 +7605,19 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( // Argument - `sizeof_kmp_task_t` (TaskSize) // Tasksize refers to the size in bytes of kmp_task_t data structure - // including private vars accessed in task. - // TODO: add kmp_task_t_with_privates (privates) - Value *TaskSize = - Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task)); + // plus any other data to be passed to the target task, if any, which + // is packed into a struct. 
kmp_task_t and the struct so created are + // packed into a wrapper struct whose type is TaskWithPrivatesTy. + Value *TaskSize = Builder.getInt64( + M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy)); // Argument - `sizeof_shareds` (SharedsSize) // SharedsSize refers to the shareds array size in the kmp_task_t data // structure. Value *SharedsSize = Builder.getInt64(0); if (HasShareds) { - auto *ArgStructAlloca = dyn_cast(StaleCI->getArgOperand(1)); + auto *ArgStructAlloca = + dyn_cast(StaleCI->getArgOperand(SharedArgOperandNo)); assert(ArgStructAlloca && "Unable to find the alloca instruction corresponding to arguments " "for extracted function"); @@ -7483,13 +7655,34 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs); + Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); if (HasShareds) { - Value *Shareds = StaleCI->getArgOperand(1); - Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); - Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData); + Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo); + Value *TaskShareds = loadSharedDataFromTaskDescriptor( + *this, Builder, TaskData, TaskWithPrivatesTy); Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment, SharedsSize); } + if (!OffloadingArraysToPrivatize.empty()) { + Value *Privates = + Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1); + for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) { + Value *PtrToPrivatize = OffloadingArraysToPrivatize[i]; + [[maybe_unused]] Type *ArrayType = + getOffloadingArrayType(PtrToPrivatize); + assert(ArrayType && "ArrayType cannot be nullptr"); + + Type *ElementType = PrivatesTy->getElementType(i); + assert(ElementType == ArrayType && + "ElementType should match ArrayType"); + (void)ArrayType; + + Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i); + Builder.CreateMemCpy( + Dst, Alignment, PtrToPrivatize, 
Alignment, + Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType))); + } + } Value *DepArray = emitTaskDependencies(*this, Dependencies); @@ -7635,9 +7828,10 @@ static void emitTargetCall( // Arguments that are intended to be directly forwarded to an // emitKernelLaunch call are pased as nullptr, since // OutlinedFnID=nullptr results in that call not being done. + OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs; return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr, /*RTLoc=*/nullptr, AllocaIP, - Dependencies, HasNoWait); + Dependencies, EmptyRTArgs, HasNoWait); } return EmitTargetCallFallbackCB(Builder.saveIP()); }()); @@ -7649,6 +7843,7 @@ static void emitTargetCall( auto &&EmitTargetCallThen = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { + Info.HasNoWait = HasNoWait; OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP()); OpenMPIRBuilder::TargetDataRTArgs RTArgs; if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs( @@ -7726,7 +7921,8 @@ static void emitTargetCall( // explicit generation of the target task. 
if (RequiresOuterTargetTask) return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP, - Dependencies, HasNoWait); + Dependencies, KArgs.RTArgs, + Info.HasNoWait); return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 8b3e91750f86c..3642e935397cb 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -21,7 +21,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "LLVMContextImpl.h" diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index e6b1f76dfacf6..196fe294a274b 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -586,11 +586,6 @@ bool llvm::stripDebugInfo(Function &F) { DenseMap LoopIDsMap; for (BasicBlock &BB : F) { for (Instruction &I : llvm::make_early_inc_range(BB)) { - if (isa(&I)) { - I.eraseFromParent(); - Changed = true; - continue; - } if (I.getDebugLoc()) { Changed = true; I.setDebugLoc(DebugLoc()); diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp index 0be6d55d724e0..ffeeeb6f1e4b0 100644 --- a/llvm/lib/IR/DebugLoc.cpp +++ b/llvm/lib/IR/DebugLoc.cpp @@ -11,11 +11,11 @@ #include "llvm/IR/DebugInfo.h" using namespace llvm; -#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE DILocAndCoverageTracking::DILocAndCoverageTracking(const DILocation *L) : TrackingMDNodeRef(const_cast(L)), Kind(DebugLocKind::Normal) {} -#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE //===----------------------------------------------------------------------===// // DebugLoc Implementation diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp index 0f1291b8bd8be..b94dcace5e3c7 100644 --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ 
b/llvm/lib/IR/DiagnosticInfo.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -211,6 +212,9 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, else if (isa(V)) { raw_string_ostream OS(Val); V->printAsOperand(OS, /*PrintType=*/false); + } else if (auto *II = dyn_cast(V)) { + raw_string_ostream OS(Val); + OS << "call " << II->getCalledFunction()->getName(); } else if (auto *I = dyn_cast(V)) { Val = I->getOpcodeName(); } else if (auto *MD = dyn_cast(V)) { diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index a33ef9c7d4a17..0a8b26b5f3d83 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 37f4a72d8c20b..2d31481f62c67 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -917,3 +917,10 @@ StringRef Module::getTargetABIFromMD() { TargetABI = TargetABIMD->getString(); return TargetABI; } + +WinX64EHUnwindV2Mode Module::getWinX64EHUnwindV2Mode() const { + Metadata *MD = getModuleFlag("winx64-eh-unwindv2"); + if (auto *CI = mdconst::dyn_extract_or_null(MD)) + return static_cast(CI->getZExtValue()); + return WinX64EHUnwindV2Mode::Disabled; +} diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index d63d398e243f9..a57b089193462 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -18,6 +18,25 @@ static cl::opt static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) { +#define LCALLNAMES(A, B, N) \ + 
Info.setLibcallName(A##N##_RELAX, #B #N "_relax"); \ + Info.setLibcallName(A##N##_ACQ, #B #N "_acq"); \ + Info.setLibcallName(A##N##_REL, #B #N "_rel"); \ + Info.setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel"); +#define LCALLNAME4(A, B) \ + LCALLNAMES(A, B, 1) \ + LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) +#define LCALLNAME5(A, B) \ + LCALLNAMES(A, B, 1) \ + LCALLNAMES(A, B, 2) \ + LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16) + LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor) + if (TT.isWindowsArm64EC()) { // FIXME: are there calls we need to exclude from this? #define HANDLE_LIBCALL(code, name) \ @@ -32,7 +51,18 @@ static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info, #include "llvm/IR/RuntimeLibcalls.def" #undef HANDLE_LIBCALL #undef LIBCALL_NO_NAME + + LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, #__aarch64_cas) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, #__aarch64_swp) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, #__aarch64_ldadd) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, #__aarch64_ldset) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, #__aarch64_ldclr) + LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, #__aarch64_ldeor) } + +#undef LCALLNAMES +#undef LCALLNAME4 +#undef LCALLNAME5 } static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) { @@ -220,11 +250,106 @@ static void setMSP430Libcalls(RuntimeLibcallsInfo &Info, const Triple &TT) { // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll } +void RuntimeLibcallsInfo::initSoftFloatCmpLibcallPredicates() { + SoftFloatCompareLibcallPredicates[RTLIB::OEQ_F32] = CmpInst::ICMP_EQ; + SoftFloatCompareLibcallPredicates[RTLIB::OEQ_F64] = CmpInst::ICMP_EQ; + 
SoftFloatCompareLibcallPredicates[RTLIB::OEQ_F128] = CmpInst::ICMP_EQ; + SoftFloatCompareLibcallPredicates[RTLIB::OEQ_PPCF128] = CmpInst::ICMP_EQ; + SoftFloatCompareLibcallPredicates[RTLIB::UNE_F32] = CmpInst::ICMP_NE; + SoftFloatCompareLibcallPredicates[RTLIB::UNE_F64] = CmpInst::ICMP_NE; + SoftFloatCompareLibcallPredicates[RTLIB::UNE_F128] = CmpInst::ICMP_NE; + SoftFloatCompareLibcallPredicates[RTLIB::UNE_PPCF128] = CmpInst::ICMP_NE; + SoftFloatCompareLibcallPredicates[RTLIB::OGE_F32] = CmpInst::ICMP_SGE; + SoftFloatCompareLibcallPredicates[RTLIB::OGE_F64] = CmpInst::ICMP_SGE; + SoftFloatCompareLibcallPredicates[RTLIB::OGE_F128] = CmpInst::ICMP_SGE; + SoftFloatCompareLibcallPredicates[RTLIB::OGE_PPCF128] = CmpInst::ICMP_SGE; + SoftFloatCompareLibcallPredicates[RTLIB::OLT_F32] = CmpInst::ICMP_SLT; + SoftFloatCompareLibcallPredicates[RTLIB::OLT_F64] = CmpInst::ICMP_SLT; + SoftFloatCompareLibcallPredicates[RTLIB::OLT_F128] = CmpInst::ICMP_SLT; + SoftFloatCompareLibcallPredicates[RTLIB::OLT_PPCF128] = CmpInst::ICMP_SLT; + SoftFloatCompareLibcallPredicates[RTLIB::OLE_F32] = CmpInst::ICMP_SLE; + SoftFloatCompareLibcallPredicates[RTLIB::OLE_F64] = CmpInst::ICMP_SLE; + SoftFloatCompareLibcallPredicates[RTLIB::OLE_F128] = CmpInst::ICMP_SLE; + SoftFloatCompareLibcallPredicates[RTLIB::OLE_PPCF128] = CmpInst::ICMP_SLE; + SoftFloatCompareLibcallPredicates[RTLIB::OGT_F32] = CmpInst::ICMP_SGT; + SoftFloatCompareLibcallPredicates[RTLIB::OGT_F64] = CmpInst::ICMP_SGT; + SoftFloatCompareLibcallPredicates[RTLIB::OGT_F128] = CmpInst::ICMP_SGT; + SoftFloatCompareLibcallPredicates[RTLIB::OGT_PPCF128] = CmpInst::ICMP_SGT; + SoftFloatCompareLibcallPredicates[RTLIB::UO_F32] = CmpInst::ICMP_NE; + SoftFloatCompareLibcallPredicates[RTLIB::UO_F64] = CmpInst::ICMP_NE; + SoftFloatCompareLibcallPredicates[RTLIB::UO_F128] = CmpInst::ICMP_NE; + SoftFloatCompareLibcallPredicates[RTLIB::UO_PPCF128] = CmpInst::ICMP_NE; +} + +static void setLongDoubleIsF128Libm(RuntimeLibcallsInfo &Info, + bool 
FiniteOnlyFuncs = false) { + Info.setLibcallName(RTLIB::REM_F128, "fmodf128"); + Info.setLibcallName(RTLIB::FMA_F128, "fmaf128"); + Info.setLibcallName(RTLIB::SQRT_F128, "sqrtf128"); + Info.setLibcallName(RTLIB::CBRT_F128, "cbrtf128"); + Info.setLibcallName(RTLIB::LOG_F128, "logf128"); + Info.setLibcallName(RTLIB::LOG2_F128, "log2f128"); + Info.setLibcallName(RTLIB::LOG10_F128, "log10f128"); + Info.setLibcallName(RTLIB::EXP_F128, "expf128"); + Info.setLibcallName(RTLIB::EXP2_F128, "exp2f128"); + Info.setLibcallName(RTLIB::EXP10_F128, "exp10f128"); + Info.setLibcallName(RTLIB::SIN_F128, "sinf128"); + Info.setLibcallName(RTLIB::COS_F128, "cosf128"); + Info.setLibcallName(RTLIB::TAN_F128, "tanf128"); + Info.setLibcallName(RTLIB::SINCOS_F128, "sincosf128"); + Info.setLibcallName(RTLIB::ASIN_F128, "asinf128"); + Info.setLibcallName(RTLIB::ACOS_F128, "acosf128"); + Info.setLibcallName(RTLIB::ATAN_F128, "atanf128"); + Info.setLibcallName(RTLIB::ATAN2_F128, "atan2f128"); + Info.setLibcallName(RTLIB::SINH_F128, "sinhf128"); + Info.setLibcallName(RTLIB::COSH_F128, "coshf128"); + Info.setLibcallName(RTLIB::TANH_F128, "tanhf128"); + Info.setLibcallName(RTLIB::POW_F128, "powf128"); + Info.setLibcallName(RTLIB::CEIL_F128, "ceilf128"); + Info.setLibcallName(RTLIB::TRUNC_F128, "truncf128"); + Info.setLibcallName(RTLIB::RINT_F128, "rintf128"); + Info.setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128"); + Info.setLibcallName(RTLIB::ROUND_F128, "roundf128"); + Info.setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128"); + Info.setLibcallName(RTLIB::FLOOR_F128, "floorf128"); + Info.setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128"); + Info.setLibcallName(RTLIB::FMIN_F128, "fminf128"); + Info.setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); + Info.setLibcallName(RTLIB::FMINIMUM_F128, "fminimumf128"); + Info.setLibcallName(RTLIB::FMAXIMUM_F128, "fmaximumf128"); + Info.setLibcallName(RTLIB::FMINIMUM_NUM_F128, "fminimum_numf128"); + Info.setLibcallName(RTLIB::FMAXIMUM_NUM_F128, 
"fmaximum_numf128"); + Info.setLibcallName(RTLIB::LROUND_F128, "lroundf128"); + Info.setLibcallName(RTLIB::LLROUND_F128, "llroundf128"); + Info.setLibcallName(RTLIB::LRINT_F128, "lrintf128"); + Info.setLibcallName(RTLIB::LLRINT_F128, "llrintf128"); + Info.setLibcallName(RTLIB::LDEXP_F128, "ldexpf128"); + Info.setLibcallName(RTLIB::FREXP_F128, "frexpf128"); + Info.setLibcallName(RTLIB::MODF_F128, "modff128"); + + if (FiniteOnlyFuncs) { + Info.setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite"); + Info.setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite"); + Info.setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite"); + Info.setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite"); + Info.setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite"); + Info.setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite"); + } else { + Info.setLibcallName(RTLIB::LOG_FINITE_F128, nullptr); + Info.setLibcallName(RTLIB::LOG2_FINITE_F128, nullptr); + Info.setLibcallName(RTLIB::LOG10_FINITE_F128, nullptr); + Info.setLibcallName(RTLIB::EXP_FINITE_F128, nullptr); + Info.setLibcallName(RTLIB::EXP2_FINITE_F128, nullptr); + Info.setLibcallName(RTLIB::POW_FINITE_F128, nullptr); + } +} + /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { - std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), - nullptr); + initSoftFloatCmpLibcallPredicates(); + + initSoftFloatCmpLibcallPredicates(); #define HANDLE_LIBCALL(code, name) setLibcallName(RTLIB::code, name); #define LIBCALL_NO_NAME nullptr @@ -232,62 +357,9 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { #undef HANDLE_LIBCALL #undef LIBCALL_NO_NAME - // Initialize calling conventions to their default. 
- for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC) - setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C); - // Use the f128 variants of math functions on x86 - if (TT.isX86() && TT.isGNUEnvironment()) { - setLibcallName(RTLIB::REM_F128, "fmodf128"); - setLibcallName(RTLIB::FMA_F128, "fmaf128"); - setLibcallName(RTLIB::SQRT_F128, "sqrtf128"); - setLibcallName(RTLIB::CBRT_F128, "cbrtf128"); - setLibcallName(RTLIB::LOG_F128, "logf128"); - setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite"); - setLibcallName(RTLIB::LOG2_F128, "log2f128"); - setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite"); - setLibcallName(RTLIB::LOG10_F128, "log10f128"); - setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite"); - setLibcallName(RTLIB::EXP_F128, "expf128"); - setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite"); - setLibcallName(RTLIB::EXP2_F128, "exp2f128"); - setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite"); - setLibcallName(RTLIB::EXP10_F128, "exp10f128"); - setLibcallName(RTLIB::SIN_F128, "sinf128"); - setLibcallName(RTLIB::COS_F128, "cosf128"); - setLibcallName(RTLIB::TAN_F128, "tanf128"); - setLibcallName(RTLIB::SINCOS_F128, "sincosf128"); - setLibcallName(RTLIB::ASIN_F128, "asinf128"); - setLibcallName(RTLIB::ACOS_F128, "acosf128"); - setLibcallName(RTLIB::ATAN_F128, "atanf128"); - setLibcallName(RTLIB::ATAN2_F128, "atan2f128"); - setLibcallName(RTLIB::SINH_F128, "sinhf128"); - setLibcallName(RTLIB::COSH_F128, "coshf128"); - setLibcallName(RTLIB::TANH_F128, "tanhf128"); - setLibcallName(RTLIB::POW_F128, "powf128"); - setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite"); - setLibcallName(RTLIB::CEIL_F128, "ceilf128"); - setLibcallName(RTLIB::TRUNC_F128, "truncf128"); - setLibcallName(RTLIB::RINT_F128, "rintf128"); - setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128"); - setLibcallName(RTLIB::ROUND_F128, "roundf128"); - setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128"); - setLibcallName(RTLIB::FLOOR_F128, 
"floorf128"); - setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128"); - setLibcallName(RTLIB::FMIN_F128, "fminf128"); - setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); - setLibcallName(RTLIB::FMINIMUM_F128, "fminimumf128"); - setLibcallName(RTLIB::FMAXIMUM_F128, "fmaximumf128"); - setLibcallName(RTLIB::FMINIMUM_NUM_F128, "fminimum_numf128"); - setLibcallName(RTLIB::FMAXIMUM_NUM_F128, "fmaximum_numf128"); - setLibcallName(RTLIB::LROUND_F128, "lroundf128"); - setLibcallName(RTLIB::LLROUND_F128, "llroundf128"); - setLibcallName(RTLIB::LRINT_F128, "lrintf128"); - setLibcallName(RTLIB::LLRINT_F128, "llrintf128"); - setLibcallName(RTLIB::LDEXP_F128, "ldexpf128"); - setLibcallName(RTLIB::FREXP_F128, "frexpf128"); - setLibcallName(RTLIB::MODF_F128, "modff128"); - } + if (TT.isX86() && TT.isGNUEnvironment()) + setLongDoubleIsF128Libm(*this, /*FiniteOnlyFuncs=*/true); // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf". if (TT.isPPC()) { @@ -321,31 +393,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { setLibcallName(RTLIB::OGT_F128, "__gtkf2"); setLibcallName(RTLIB::UO_F128, "__unordkf2"); - setLibcallName(RTLIB::LOG_F128, "logf128"); - setLibcallName(RTLIB::LOG2_F128, "log2f128"); - setLibcallName(RTLIB::LOG10_F128, "log10f128"); - setLibcallName(RTLIB::EXP_F128, "expf128"); - setLibcallName(RTLIB::EXP2_F128, "exp2f128"); - setLibcallName(RTLIB::SIN_F128, "sinf128"); - setLibcallName(RTLIB::COS_F128, "cosf128"); - setLibcallName(RTLIB::SINCOS_F128, "sincosf128"); - setLibcallName(RTLIB::POW_F128, "powf128"); - setLibcallName(RTLIB::FMIN_F128, "fminf128"); - setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); - setLibcallName(RTLIB::REM_F128, "fmodf128"); - setLibcallName(RTLIB::SQRT_F128, "sqrtf128"); - setLibcallName(RTLIB::CEIL_F128, "ceilf128"); - setLibcallName(RTLIB::FLOOR_F128, "floorf128"); - setLibcallName(RTLIB::TRUNC_F128, "truncf128"); - setLibcallName(RTLIB::ROUND_F128, "roundf128"); - setLibcallName(RTLIB::LROUND_F128, "lroundf128"); 
- setLibcallName(RTLIB::LLROUND_F128, "llroundf128"); - setLibcallName(RTLIB::RINT_F128, "rintf128"); - setLibcallName(RTLIB::LRINT_F128, "lrintf128"); - setLibcallName(RTLIB::LLRINT_F128, "llrintf128"); - setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128"); - setLibcallName(RTLIB::FMA_F128, "fmaf128"); - setLibcallName(RTLIB::FREXP_F128, "frexpf128"); + // TODO: Do the finite only functions exist? + setLongDoubleIsF128Libm(*this, /*FiniteOnlyFuncs=*/false); if (TT.isOSAIX()) { bool isPPC64 = TT.isPPC64(); @@ -379,7 +428,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { break; } - if (darwinHasSinCos(TT)) { + if (darwinHasSinCosStret(TT)) { setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret"); setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret"); if (TT.isWatchABI()) { @@ -423,8 +472,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { setLibcallName(RTLIB::EXP10_F64, "__exp10"); } - if (TT.isGNUEnvironment() || TT.isOSFuchsia() || - (TT.isAndroid() && !TT.isAndroidVersionLT(9))) { + if (hasSinCos(TT)) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); setLibcallName(RTLIB::SINCOS_F80, "sincosl"); @@ -488,7 +536,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { } } - if (TT.getArch() == Triple::ArchType::aarch64) + if (TT.isAArch64()) setAArch64LibcallNames(*this, TT); else if (TT.isARM() || TT.isThumb()) setARMLibcallNames(*this, TT); diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp index e8eaf4619df51..ba672d2fc2ec0 100644 --- a/llvm/lib/MC/MCAsmInfo.cpp +++ b/llvm/lib/MC/MCAsmInfo.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -157,17 +158,12 @@ void MCAsmInfo::printExpr(raw_ostream &OS, const MCExpr &Expr) const { Expr.print(OS, this); } -void 
MCAsmInfo::printSpecifierExpr(raw_ostream &OS, - const MCSpecifierExpr &Expr) const { - // TODO: Switch to unreachable after all targets that use MCSpecifierExpr - // migrate to MCAsmInfo::printSpecifierExpr. - Expr.printImpl(OS, this); -} - -bool MCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, +bool MCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &E, MCValue &Res, const MCAssembler *Asm) const { - // TODO: Remove after all targets that use MCSpecifierExpr migrate to - // MCAsmInfo::evaluateAsRelocatableImpl. - return Expr.evaluateAsRelocatableImpl(Res, Asm); + if (!E.getSubExpr()->evaluateAsRelocatable(Res, Asm)) + return false; + + Res.setSpecifier(E.getSpecifier()); + return !Res.getSubSym(); } diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 89191294f3ed3..8919a2627cf6a 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -754,12 +754,3 @@ const MCSpecifierExpr *MCSpecifierExpr::create(const MCSymbol *Sym, Spec S, MCContext &Ctx, SMLoc Loc) { return new (Ctx) MCSpecifierExpr(MCSymbolRefExpr::create(Sym, Ctx), S, Loc); } - -bool MCSpecifierExpr::evaluateAsRelocatableImpl(MCValue &Res, - const MCAssembler *Asm) const { - if (!getSubExpr()->evaluateAsRelocatable(Res, Asm)) - return false; - - Res.setSpecifier(specifier); - return !Res.getSubSym(); -} diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp index 8ecd669e67178..93bc6631e64c8 100644 --- a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp @@ -116,10 +116,6 @@ uint64_t MachOLayoutBuilder::layoutSegments() { const bool IsObjectFile = O.Header.FileType == MachO::HeaderFileType::MH_OBJECT; uint64_t Offset = IsObjectFile ? (HeaderSize + O.Header.SizeOfCmds) : 0; - // If we are emitting an encryptable binary, our load commands must have a - // separate (non-encrypted) page to themselves. 
- bool RequiresFirstSectionOutsideFirstPage = - O.EncryptionInfoCommandIndex.has_value(); for (LoadCommand &LC : O.LoadCommands) { auto &MLC = LC.MachOLoadCommand; StringRef Segname; @@ -173,10 +169,6 @@ uint64_t MachOLayoutBuilder::layoutSegments() { if (!Sec->hasValidOffset()) { Sec->Offset = 0; } else { - if (RequiresFirstSectionOutsideFirstPage) { - SectOffset = alignToPowerOf2(SectOffset, PageSize); - RequiresFirstSectionOutsideFirstPage = false; - } Sec->Offset = SegOffset + SectOffset; Sec->Size = Sec->Content.size(); SegFileSize = std::max(SegFileSize, SectOffset + Sec->Size); diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp index e0819d89d24ff..8d2c02dc37c99 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp @@ -98,10 +98,6 @@ void Object::updateLoadCommandIndexes() { case MachO::LC_DYLD_EXPORTS_TRIE: ExportsTrieCommandIndex = Index; break; - case MachO::LC_ENCRYPTION_INFO: - case MachO::LC_ENCRYPTION_INFO_64: - EncryptionInfoCommandIndex = Index; - break; } } } diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h index 13ac87ed3ed06..8f9444f5fb025 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObject.h +++ b/llvm/lib/ObjCopy/MachO/MachOObject.h @@ -341,9 +341,6 @@ struct Object { /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command /// corresponding to the __TEXT segment. std::optional TextSegmentCommandIndex; - /// The index of the LC_ENCRYPTION_INFO or LC_ENCRYPTION_INFO_64 load command - /// if present. 
- std::optional EncryptionInfoCommandIndex; BumpPtrAllocator Alloc; StringSaver NewSectionsContents; diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.cpp b/llvm/lib/ObjCopy/MachO/MachOReader.cpp index ef0e0262f9395..2b344f36d8e78 100644 --- a/llvm/lib/ObjCopy/MachO/MachOReader.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOReader.cpp @@ -184,10 +184,6 @@ Error MachOReader::readLoadCommands(Object &O) const { case MachO::LC_DYLD_CHAINED_FIXUPS: O.ChainedFixupsCommandIndex = O.LoadCommands.size(); break; - case MachO::LC_ENCRYPTION_INFO: - case MachO::LC_ENCRYPTION_INFO_64: - O.EncryptionInfoCommandIndex = O.LoadCommands.size(); - break; } #define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct) \ case MachO::LCName: \ diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp index 8cf748aa5681c..b6318bbe3ab74 100644 --- a/llvm/lib/Object/RelocationResolver.cpp +++ b/llvm/lib/Object/RelocationResolver.cpp @@ -274,11 +274,13 @@ static bool supportsAmdgpu(uint64_t Type) { } static uint64_t resolveAmdgpu(uint64_t Type, uint64_t Offset, uint64_t S, - uint64_t /*LocData*/, int64_t Addend) { + uint64_t LocData, int64_t Addend) { + assert((LocData == 0 || Addend == 0) && + "one of LocData and Addend must be 0"); switch (Type) { case ELF::R_AMDGPU_ABS32: case ELF::R_AMDGPU_ABS64: - return S + Addend; + return S + LocData + Addend; default: llvm_unreachable("Invalid relocation type"); } diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc index b5c3719f57963..c6e79af44b9b4 100644 --- a/llvm/lib/Support/Unix/Process.inc +++ b/llvm/lib/Support/Unix/Process.inc @@ -34,6 +34,9 @@ #ifdef HAVE_GETAUXVAL #include #endif +#ifdef HAVE_SYS_IOCTL_H +#include +#endif //===----------------------------------------------------------------------===// //=== WARNING: Implementation here must contain only generic UNIX code that @@ -304,31 +307,40 @@ bool Process::FileDescriptorIsDisplayed(int fd) { #endif } -static unsigned getColumns() { 
+static unsigned getColumns(int FileID) { // If COLUMNS is defined in the environment, wrap to that many columns. + // This matches GCC. if (const char *ColumnsStr = std::getenv("COLUMNS")) { int Columns = std::atoi(ColumnsStr); if (Columns > 0) return Columns; } - // We used to call ioctl TIOCGWINSZ to determine the width. It is considered - // unuseful. - return 0; + // Some shells do not export COLUMNS; query the column count via ioctl() + // instead if it isn't available. + unsigned Columns = 0; + +#if defined(HAVE_SYS_IOCTL_H) && !defined(__sun__) + struct winsize ws; + if (ioctl(FileID, TIOCGWINSZ, &ws) == 0) + Columns = ws.ws_col; +#endif + + return Columns; } unsigned Process::StandardOutColumns() { if (!StandardOutIsDisplayed()) return 0; - return getColumns(); + return getColumns(0); } unsigned Process::StandardErrColumns() { if (!StandardErrIsDisplayed()) return 0; - return getColumns(); + return getColumns(1); } static bool terminalHasColors() { diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 3f92c1dbfbf49..4099f40ea07fd 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -58,6 +58,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -3534,7 +3535,8 @@ INITIALIZE_PASS(AArch64AsmPrinter, "aarch64-asm-printer", "AArch64 Assembly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAArch64AsmPrinter() { RegisterAsmPrinter X(getTheAArch64leTarget()); RegisterAsmPrinter Y(getTheAArch64beTarget()); RegisterAsmPrinter Z(getTheARM64Target()); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 2650c621e19f6..7ffe779f2408d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2896,8 +2896,6 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize(); if (isSVE) { - assert(-ObjectOffset > (int64_t)AFI->getSVECalleeSavedStackSize() && - "Math isn't correct for CSRs with FPAfterSVECalleeSaves"); StackOffset FPOffset = StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset); StackOffset SPOffset = @@ -2905,6 +2903,8 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), ObjectOffset); if (FPAfterSVECalleeSaves) { + assert(-ObjectOffset > (int64_t)AFI->getSVECalleeSavedStackSize() && + "Math isn't correct for CSRs with FPAfterSVECalleeSaves"); FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); } // Always use the FP for SVE spills if available and beneficial. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7519ac5260a64..1169efce3123f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -153,13 +153,6 @@ cl::opt EnableSVEGISel( cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false)); -// FIXME : This is a temporary flag, and is used to help transition to -// performing lowering the proper way using the new PARTIAL_REDUCE_MLA ISD -// nodes. 
-static cl::opt EnablePartialReduceNodes( - "aarch64-enable-partial-reduce-nodes", cl::init(false), cl::ReallyHidden, - cl::desc("Use the new method of lowering partial reductions.")); - /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -966,27 +959,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall); setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall); setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall); -#define LCALLNAMES(A, B, N) \ - setLibcallName(A##N##_RELAX, #B #N "_relax"); \ - setLibcallName(A##N##_ACQ, #B #N "_acq"); \ - setLibcallName(A##N##_REL, #B #N "_rel"); \ - setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel"); -#define LCALLNAME4(A, B) \ - LCALLNAMES(A, B, 1) \ - LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) -#define LCALLNAME5(A, B) \ - LCALLNAMES(A, B, 1) \ - LCALLNAMES(A, B, 2) \ - LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16) - LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) - LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) - LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd) - LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset) - LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr) - LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor) -#undef LCALLNAMES -#undef LCALLNAME4 -#undef LCALLNAME5 } if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) { @@ -1457,7 +1429,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::FADD, VT, Custom); - if (EnablePartialReduceNodes && Subtarget->hasDotProd()) { + if (Subtarget->hasDotProd()) { static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA}; @@ -1895,7 +1867,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } // Handle partial reduction operations - if 
(EnablePartialReduceNodes && Subtarget->isSVEorStreamingSVEAvailable()) { + if (Subtarget->isSVEorStreamingSVEAvailable()) { // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT). // Other pairs will default to 'Expand'. static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA, @@ -1957,17 +1929,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64, Custom); - if (EnablePartialReduceNodes) { - static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA, - ISD::PARTIAL_REDUCE_UMLA}; - // Must be lowered to SVE instructions. - setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom); - setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom); - setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom); - setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom); - setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom); - setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom); - } + static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA, + ISD::PARTIAL_REDUCE_UMLA}; + // Must be lowered to SVE instructions. 
+ setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom); + setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom); + setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom); + setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom); + setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom); + setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom); } } @@ -2165,16 +2135,6 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic( assert(I->getIntrinsicID() == Intrinsic::experimental_vector_partial_reduce_add && "Unexpected intrinsic!"); - if (EnablePartialReduceNodes) - return true; - - EVT VT = EVT::getEVT(I->getType()); - auto Op1 = I->getOperand(1); - EVT Op1VT = EVT::getEVT(Op1->getType()); - if (Op1VT.getVectorElementType() == VT.getVectorElementType() && - (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() || - VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount())) - return false; return true; } @@ -2252,37 +2212,32 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { bool PreferNEON = VT.is64BitVector() || VT.is128BitVector(); bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable(); - if (EnablePartialReduceNodes) { - static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA, - ISD::PARTIAL_REDUCE_UMLA}; - unsigned NumElts = VT.getVectorNumElements(); - if (VT.getVectorElementType() == MVT::i64) { - setPartialReduceMLAAction(MLAOps, VT, - MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); - setPartialReduceMLAAction( - MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 4), Custom); - setPartialReduceMLAAction( - MLAOps, VT, MVT::getVectorVT(MVT::i32, NumElts * 2), Custom); - } else if (VT.getVectorElementType() == MVT::i32) { - setPartialReduceMLAAction(MLAOps, VT, + static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA, + ISD::PARTIAL_REDUCE_UMLA}; + unsigned NumElts = VT.getVectorNumElements(); + if (VT.getVectorElementType() == MVT::i64) { 
+ setPartialReduceMLAAction(MLAOps, VT, + MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); + setPartialReduceMLAAction(MLAOps, VT, + MVT::getVectorVT(MVT::i16, NumElts * 4), Custom); + setPartialReduceMLAAction(MLAOps, VT, + MVT::getVectorVT(MVT::i32, NumElts * 2), Custom); + } else if (VT.getVectorElementType() == MVT::i32) { + setPartialReduceMLAAction(MLAOps, VT, + MVT::getVectorVT(MVT::i8, NumElts * 4), Custom); + setPartialReduceMLAAction(MLAOps, VT, + MVT::getVectorVT(MVT::i16, NumElts * 2), Custom); + } else if (VT.getVectorElementType() == MVT::i16) { + setPartialReduceMLAAction(MLAOps, VT, + MVT::getVectorVT(MVT::i8, NumElts * 2), Custom); + } + if (Subtarget->hasMatMulInt8()) { + if (VT.getVectorElementType() == MVT::i32) + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT, MVT::getVectorVT(MVT::i8, NumElts * 4), Custom); - setPartialReduceMLAAction( - MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 2), Custom); - } else if (VT.getVectorElementType() == MVT::i16) { - setPartialReduceMLAAction(MLAOps, VT, - MVT::getVectorVT(MVT::i8, NumElts * 2), Custom); - } - - if (Subtarget->hasMatMulInt8()) { - if (VT.getVectorElementType() == MVT::i32) - setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT, - MVT::getVectorVT(MVT::i8, NumElts * 4), - Custom); - else if (VT.getVectorElementType() == MVT::i64) - setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT, - MVT::getVectorVT(MVT::i8, NumElts * 8), - Custom); - } + else if (VT.getVectorElementType() == MVT::i64) + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT, + MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); } // Lower fixed length vector operations to scalable equivalents. 
@@ -5106,9 +5061,10 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, StructType *RetTy = StructType::get(ArgTy, ArgTy); TargetLowering::CallLoweringInfo CLI(DAG); + CallingConv::ID CC = getLibcallCallingConv(LC); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); + .setLibCallee(CC, RetTy, Callee, std::move(Args)); std::pair CallResult = LowerCallTo(CLI); return CallResult.first; diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 0c0b512e3b6ce..75c7dd944b467 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -369,8 +369,7 @@ Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst, unsigned Count = 0; for (; Count < ClScanLimit && !BI->isTerminator(); ++BI) { - if (!isa(*BI)) - ++Count; + ++Count; if (isNoModRef(AA->getModRefInfo(&*BI, AllocaLoc))) continue; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 50f52cca6c8ac..8150e91c8ba52 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -44,6 +44,7 @@ #include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include "llvm/TargetParser/Triple.h" @@ -223,7 +224,8 @@ static cl::opt cl::desc("Enable Machine Pipeliner for AArch64"), cl::init(false), cl::Hidden); -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAArch64Target() { // Register the target. 
RegisterTargetMachine X(getTheAArch64leTarget()); RegisterTargetMachine Y(getTheAArch64beTarget()); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 0232ac421aeda..ed051f295752e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4353,15 +4353,26 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost( } } - // Treat the icmp in icmp(and, 0) as free, as we can make use of ands. - // FIXME: This can apply to more conditions and add/sub if it can be shown to - // be profitable. + // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to + // icmp(and, 0) as free, as we can make use of ands, but only if the + // comparison is not unsigned. if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I && - ICmpInst::isEquality(VecPred) && + !CmpInst::isUnsigned(VecPred) && TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) && - match(I->getOperand(1), m_Zero()) && - match(I->getOperand(0), m_And(m_Value(), m_Value()))) - return 0; + match(I->getOperand(0), m_And(m_Value(), m_Value()))) { + if (match(I->getOperand(1), m_Zero())) + return 0; + + // x >= 1 / x < 1 -> x > 0 / x <= 0 + if (match(I->getOperand(1), m_One()) && + (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE)) + return 0; + + // x <= -1 / x > -1 -> x > 0 / x <= 0 + if (match(I->getOperand(1), m_AllOnes()) && + (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT)) + return 0; + } // The base case handles scalable vectors fine for now, since it treats the // cost as 1 * legalization cost. 
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 2f67ff55f26b7..d8bdc01a3454f 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -8322,7 +8322,8 @@ bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, } /// Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAArch64AsmParser() { RegisterMCAsmParser X(getTheAArch64leTarget()); RegisterMCAsmParser Y(getTheAArch64beTarget()); RegisterMCAsmParser Z(getTheARM64Target()); diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index bab0cbe7788e9..ae984be670fc2 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -310,7 +310,8 @@ createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo, SymbolLookUp, DisInfo); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Disassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAArch64Disassembler() { TargetRegistry::RegisterMCDisassembler(getTheAArch64leTarget(), createAArch64Disassembler); TargetRegistry::RegisterMCDisassembler(getTheAArch64beTarget(), diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index a82896dbe0d6c..b2cd1d0f4156e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -184,8 +184,7 @@ void AArch64MCAsmInfoDarwin::printSpecifierExpr( raw_ostream &OS, const MCSpecifierExpr &Expr) const { if (auto *AE = dyn_cast(&Expr)) return AE->print(OS, this); - // FIXME: tryParseAdrLabel should not 
use VK_ABS for Mach-O - assert(Expr.getSpecifier() == AArch64MCExpr::VK_ABS); + OS << AArch64::getSpecifierName(Expr); printExpr(OS, *Expr.getSubExpr()); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index b7959e02ec268..efc13589bab63 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/AArch64TargetParser.h" @@ -503,7 +504,8 @@ static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) { } // Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64TargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAArch64TargetMC() { for (Target *T : {&getTheAArch64leTarget(), &getTheAArch64beTarget(), &getTheAArch64_32Target(), &getTheARM64Target(), &getTheARM64_32Target()}) { diff --git a/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp index 52c88fd0218d6..c9ebd3b4a6517 100644 --- a/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp +++ b/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/AArch64TargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; Target &llvm::getTheAArch64leTarget() { @@ -31,7 +32,8 @@ Target &llvm::getTheARM64_32Target() { return TheARM64_32Target; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64TargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAArch64TargetInfo() { // Now register the "arm64" name for use with "-march". 
We don't want it to // take possession of the Triple::aarch64 tags though. TargetRegistry::RegisterTarget(getTheARM64Target(), "arm64", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 491314daf2d81..84b0f98554097 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -43,6 +43,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/TargetParser.h" @@ -83,7 +84,8 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm, return new AMDGPUAsmPrinter(tm, std::move(Streamer)); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAMDGPUAsmPrinter() { TargetRegistry::RegisterAsmPrinter(getTheR600Target(), llvm::createR600AsmPrinterPass); TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 9be8821d5bf96..5477c5eae9392 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -60,28 +60,6 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, return maxnum(Src0, Src1); } -enum class KnownIEEEMode { Unknown, On, Off }; - -/// Return KnownIEEEMode::On if we know if the use context can assume -/// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume -/// "amdgpu-ieee"="false". 
-static KnownIEEEMode fpenvIEEEMode(const Instruction &I, - const GCNSubtarget &ST) { - if (!ST.hasIEEEMode()) // Only mode on gfx12 - return KnownIEEEMode::On; - - const Function *F = I.getFunction(); - if (!F) - return KnownIEEEMode::Unknown; - - Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee"); - if (IEEEAttr.isValid()) - return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off; - - return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off - : KnownIEEEMode::On; -} - // Check if a value can be converted to a 16-bit value without losing // precision. // The value is expected to be either a float (IsFloat = true) or an unsigned @@ -270,6 +248,66 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, }); } } + + // Only perform D16 folding if every user of the image sample is + // an ExtractElementInst immediately followed by an FPTrunc to half. + SmallVector, 4> + ExtractTruncPairs; + bool AllHalfExtracts = true; + + for (User *U : II.users()) { + auto *Ext = dyn_cast(U); + if (!Ext || !Ext->hasOneUse()) { + AllHalfExtracts = false; + break; + } + + auto *Tr = dyn_cast(*Ext->user_begin()); + if (!Tr || !Tr->getType()->isHalfTy()) { + AllHalfExtracts = false; + break; + } + + ExtractTruncPairs.emplace_back(Ext, Tr); + } + + if (!ExtractTruncPairs.empty() && AllHalfExtracts) { + auto *VecTy = cast(II.getType()); + Type *HalfVecTy = + VecTy->getWithNewType(Type::getHalfTy(II.getContext())); + + // Obtain the original image sample intrinsic's signature + // and replace its return type with the half-vector for D16 folding + SmallVector SigTys; + Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys); + SigTys[0] = HalfVecTy; + + Module *M = II.getModule(); + Function *HalfDecl = + Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys); + + II.mutateType(HalfVecTy); + II.setCalledFunction(HalfDecl); + + IRBuilder<> Builder(II.getContext()); + for (auto &[Ext, Tr] : ExtractTruncPairs) { + Value *Idx = 
Ext->getIndexOperand(); + + Builder.SetInsertPoint(Tr); + + Value *HalfExtract = Builder.CreateExtractElement(&II, Idx); + HalfExtract->takeName(Tr); + + Tr->replaceAllUsesWith(HalfExtract); + } + + for (auto &[Ext, Tr] : ExtractTruncPairs) { + IC.eraseInstFromFunction(*Tr); + IC.eraseInstFromFunction(*Ext); + } + + return &II; + } } } @@ -1004,7 +1042,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // TODO: Also can fold to 2 operands with infinities. if ((match(Src0, m_APFloat(ConstSrc0)) && ConstSrc0->isNaN()) || isa(Src0)) { - switch (fpenvIEEEMode(II, *ST)) { + switch (fpenvIEEEMode(II)) { case KnownIEEEMode::On: // TODO: If Src2 is snan, does it need quieting? if (ConstSrc0 && ConstSrc0->isSignaling()) @@ -1019,7 +1057,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } } else if ((match(Src1, m_APFloat(ConstSrc1)) && ConstSrc1->isNaN()) || isa(Src1)) { - switch (fpenvIEEEMode(II, *ST)) { + switch (fpenvIEEEMode(II)) { case KnownIEEEMode::On: // TODO: If Src2 is snan, does it need quieting? 
if (ConstSrc1 && ConstSrc1->isSignaling()) @@ -1035,7 +1073,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } } else if ((match(Src2, m_APFloat(ConstSrc2)) && ConstSrc2->isNaN()) || isa(Src2)) { - switch (fpenvIEEEMode(II, *ST)) { + switch (fpenvIEEEMode(II)) { case KnownIEEEMode::On: if (ConstSrc2 && ConstSrc2->isSignaling()) { auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 18a948d68e97b..7a50923ffedc6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -92,6 +92,8 @@ def FP64Denormals : Predicate<"MF->getInfo()->getMode().F def NoFP16Denormals : Predicate<"MF->getInfo()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">; def NoFP32Denormals : Predicate<"MF->getInfo()->getMode().FP32Denormals == DenormalMode::getPreserveSign()">; def NoFP64Denormals : Predicate<"MF->getInfo()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">; +def IEEEModeEnabled : Predicate<"MF->getInfo()->getMode().IEEE">; +def IEEEModeDisabled : Predicate<"!MF->getInfo()->getMode().IEEE">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index e8dff85064383..f82e6df9bcbfc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -957,12 +957,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, FPOpActions.clampMaxNumElementsStrict(0, S32, 2); } - auto &MinNumMaxNum = getActionDefinitionsBuilder({ - G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); - - // TODO: These should be custom lowered and are directly legal with IEEE=0 - auto &MinimumNumMaximumNum = - getActionDefinitionsBuilder({G_FMINIMUMNUM, G_FMAXIMUMNUM}); + auto &MinNumMaxNum = 
getActionDefinitionsBuilder( + {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE, + G_FMAXNUM_IEEE}); if (ST.hasVOP3PInsts()) { MinNumMaxNum.customFor(FPTypesPK16) @@ -980,8 +977,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } - MinimumNumMaximumNum.lower(); - if (ST.hasVOP3PInsts()) FPOpActions.clampMaxNumElementsStrict(0, S16, 2); @@ -2162,6 +2157,8 @@ bool AMDGPULegalizerInfo::legalizeCustom( return legalizeFPTOI(MI, MRI, B, false); case TargetOpcode::G_FMINNUM: case TargetOpcode::G_FMAXNUM: + case TargetOpcode::G_FMINIMUMNUM: + case TargetOpcode::G_FMAXIMUMNUM: case TargetOpcode::G_FMINNUM_IEEE: case TargetOpcode::G_FMAXNUM_IEEE: return legalizeMinNumMaxNum(Helper, MI); @@ -2741,9 +2738,17 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; // With ieee_mode disabled, the instructions have the correct behavior - // already for G_FMINNUM/G_FMAXNUM - if (!MFI->getMode().IEEE) + // already for G_FMINIMUMNUM/G_FMAXIMUMNUM. + // + // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode + // enabled. + if (!MFI->getMode().IEEE) { + if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM || + MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM) + return true; + return !IsIEEEOp; + } if (IsIEEEOp) return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index e90a3a275f67c..700dc87d2f821 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -818,39 +818,6 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, return I; } -/// Get the underlying type of a homogeneous aggregate type, or nullptr if the -/// type is non-homogeneous. 
-static Type *getHomogeneousType(Type *Ty) { - Type *ElemTy = nullptr; - SmallVector WorkList; - WorkList.push_back(Ty); - while (!WorkList.empty()) { - Type *CurTy = WorkList.pop_back_val(); - - // Check if the current type is an aggregate type. - if (auto *VectorTy = dyn_cast(CurTy)) { - WorkList.push_back(VectorTy->getElementType()); - continue; - } - if (auto *ArrayTy = dyn_cast(CurTy)) { - WorkList.push_back(ArrayTy->getElementType()); - continue; - } - if (auto *StructTy = dyn_cast(CurTy)) { - WorkList.append(StructTy->element_begin(), StructTy->element_end()); - continue; - } - - // If not, it must be the same as all other non-aggregate types. - if (!ElemTy) - ElemTy = CurTy; - else if (ElemTy != CurTy) - return nullptr; - } - - return ElemTy; -} - // FIXME: Should try to pick the most likely to be profitable allocas first. bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); @@ -861,42 +828,42 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { } Type *AllocaTy = Alloca.getAllocatedType(); - Type *ElemTy = getHomogeneousType(AllocaTy); - - if (!ElemTy || !VectorType::isValidElementType(ElemTy)) { - LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); - return false; - } + auto *VectorTy = dyn_cast(AllocaTy); + if (auto *ArrayTy = dyn_cast(AllocaTy)) { + uint64_t NumElems = 1; + Type *ElemTy; + do { + NumElems *= ArrayTy->getNumElements(); + ElemTy = ArrayTy->getElementType(); + } while ((ArrayTy = dyn_cast(ElemTy))); + + // Check for array of vectors + auto *InnerVectorTy = dyn_cast(ElemTy); + if (InnerVectorTy) { + NumElems *= InnerVectorTy->getNumElements(); + ElemTy = InnerVectorTy->getElementType(); + } - unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy); - if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) { - LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " - "does not match the 
type's size\n"); - return false; - } - unsigned ElementSize = ElementSizeInBits / 8; - if (ElementSize == 0) { - LLVM_DEBUG(dbgs() << " Cannot create vector of zero-sized elements\n"); - return false; + if (VectorType::isValidElementType(ElemTy) && NumElems > 0) { + unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8; + if (ElementSize > 0) { + unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy); + // Expand vector if required to match padding of inner type, + // i.e. odd size subvectors. + // Storage size of new vector must match that of alloca for correct + // behaviour of byte offsets and GEP computation. + if (NumElems * ElementSize != AllocaSize) + NumElems = AllocaSize / ElementSize; + if (NumElems > 0 && (AllocaSize % ElementSize) == 0) + VectorTy = FixedVectorType::get(ElemTy, NumElems); + } + } } - // Calculate the size of the corresponding vector, accounting for padding of - // inner types, e.g., odd-sized subvectors. Storage size of new vector must - // match that of alloca for correct behaviour of byte offsets and GEP - // computation. 
- unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy); - unsigned NumElems = AllocaSize / ElementSize; - if (NumElems == 0) { - LLVM_DEBUG(dbgs() << " Cannot vectorize an empty aggregate type\n"); - return false; - } - if (NumElems * ElementSize != AllocaSize) { - LLVM_DEBUG( - dbgs() << " Cannot convert type into vector of the same size\n"); + if (!VectorTy) { + LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } - auto *VectorTy = FixedVectorType::get(ElemTy, NumElems); - assert(VectorTy && "Failed to create vector type."); const unsigned MaxElements = (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType()); @@ -928,6 +895,15 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n"); + Type *VecEltTy = VectorTy->getElementType(); + unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy); + if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) { + LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " + "does not match the type's size\n"); + return false; + } + unsigned ElementSize = ElementSizeInBits / 8; + assert(ElementSize > 0); for (auto *U : Uses) { Instruction *Inst = cast(U->getUser()); @@ -967,7 +943,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (auto *GEP = dyn_cast(Inst)) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. 
- Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts); + Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index dd7aef8f0c583..4391a48ff2b68 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4009,6 +4009,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FMAXNUM: case AMDGPU::G_FMINIMUM: case AMDGPU::G_FMAXIMUM: + case AMDGPU::G_FMINIMUMNUM: + case AMDGPU::G_FMAXIMUMNUM: case AMDGPU::G_INTRINSIC_TRUNC: case AMDGPU::G_STRICT_FADD: case AMDGPU::G_STRICT_FSUB: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d59087839b0e1..f390d39043ed5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -89,6 +89,7 @@ #include "llvm/InitializePasses.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Transforms/HipStdPar/HipStdPar.h" #include "llvm/Transforms/IPO.h" @@ -481,7 +482,7 @@ static cl::opt HasClosedWorldAssumption( cl::desc("Whether has closed-world assumption at link time"), cl::init(false), cl::Hidden); -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheR600Target()); RegisterTargetMachine Y(getTheGCNTarget()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 58bfc0b80b24f..d5a1aaef4ad68 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp 
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -685,6 +685,8 @@ static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) { case Intrinsic::fma: case Intrinsic::fmuladd: case Intrinsic::copysign: + case Intrinsic::minimumnum: + case Intrinsic::maximumnum: case Intrinsic::canonicalize: // There's a small benefit to using vector ops in the legalized code. case Intrinsic::round: @@ -702,8 +704,29 @@ static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) { InstructionCost GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const { - if (ICA.getID() == Intrinsic::fabs) + switch (ICA.getID()) { + case Intrinsic::fabs: + // Free source modifier in the common case. + return 0; + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::amdgcn_workitem_id_z: + // TODO: If hasPackedTID, or if the calling context is not an entry point + // there may be a bit instruction. + return 0; + case Intrinsic::amdgcn_workgroup_id_x: + case Intrinsic::amdgcn_workgroup_id_y: + case Intrinsic::amdgcn_workgroup_id_z: + case Intrinsic::amdgcn_lds_kernel_id: + case Intrinsic::amdgcn_dispatch_ptr: + case Intrinsic::amdgcn_dispatch_id: + case Intrinsic::amdgcn_implicitarg_ptr: + case Intrinsic::amdgcn_queue_ptr: + // Read from an argument register. 
return 0; + default: + break; + } if (!intrinsicHasPackedVectorBenefit(ICA.getID())) return BaseT::getIntrinsicInstrCost(ICA, CostKind); @@ -718,10 +741,7 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; - if (SLT == MVT::f64) - return LT.first * NElts * get64BitInstrCost(CostKind); - - if ((ST->has16BitInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) || + if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) || (ST->hasPackedFP32Ops() && SLT == MVT::f32)) NElts = (NElts + 1) / 2; @@ -731,6 +751,11 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, switch (ICA.getID()) { case Intrinsic::fma: case Intrinsic::fmuladd: + if (SLT == MVT::f64) { + InstRate = get64BitInstrCost(CostKind); + break; + } + if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16) InstRate = getFullRateInstrCost(); else { @@ -740,9 +765,26 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, break; case Intrinsic::copysign: return NElts * getFullRateInstrCost(); + case Intrinsic::minimumnum: + case Intrinsic::maximumnum: { + // Instruction + 2 canonicalizes. For cases that need type promotion, we the + // promotion takes the place of the canonicalize. + unsigned NumOps = 3; + if (const IntrinsicInst *II = ICA.getInst()) { + // Directly legal with ieee=0 + // TODO: Not directly legal with strictfp + if (fpenvIEEEMode(*II) == KnownIEEEMode::Off) + NumOps = 1; + } + + unsigned BaseRate = + SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost(); + InstRate = BaseRate * NumOps; + break; + } case Intrinsic::canonicalize: { - assert(SLT != MVT::f64); - InstRate = getFullRateInstrCost(); + InstRate = + SLT == MVT::f64 ? 
get64BitInstrCost(CostKind) : getFullRateInstrCost(); break; } case Intrinsic::uadd_sat: @@ -1443,3 +1485,20 @@ void GCNTTIImpl::collectKernelLaunchBounds( LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first}); LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second}); } + +GCNTTIImpl::KnownIEEEMode +GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const { + if (!ST->hasIEEEMode()) // Only mode on gfx12 + return KnownIEEEMode::On; + + const Function *F = I.getFunction(); + if (!F) + return KnownIEEEMode::Unknown; + + Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee"); + if (IEEEAttr.isValid()) + return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off; + + return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off + : KnownIEEEMode::On; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index ec298c7e9631a..0fae301abf532 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -281,6 +281,13 @@ class GCNTTIImpl final : public BasicTTIImplBase { void collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const override; + + enum class KnownIEEEMode { Unknown, On, Off }; + + /// Return KnownIEEEMode::On if we know if the use context can assume + /// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume + /// "amdgpu-ieee"="false". 
+ KnownIEEEMode fpenvIEEEMode(const Instruction &I) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 0dc1d13773229..30dcd6d81f16d 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" #include "llvm/TargetParser/TargetParser.h" #include @@ -9800,7 +9801,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } /// Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAMDGPUAsmParser() { RegisterMCAsmParser A(getTheR600Target()); RegisterMCAsmParser B(getTheGCNTarget()); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index ca0093d1f049c..349e408b79658 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -34,6 +34,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -2648,7 +2649,8 @@ static MCDisassembler *createAMDGPUDisassembler(const Target &T, return new AMDGPUDisassembler(STI, Ctx, T.createMCInstrInfo()); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAMDGPUDisassembler() { TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(), createAMDGPUDisassembler); TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(), diff --git 
a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index 2768e0c23cf01..b8f43c4550b7e 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -16,6 +16,7 @@ #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/WithColor.h" namespace llvm::mca { @@ -353,7 +354,8 @@ createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI, /// Extern function to initialize the targets for the AMDGPU backend -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAMDGPUTargetMCA() { TargetRegistry::RegisterCustomBehaviour(getTheR600Target(), createAMDGPUCustomBehaviour); TargetRegistry::RegisterInstrPostProcess(getTheR600Target(), diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index c692895d84c00..d66725d3a6c4b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -31,6 +31,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -156,7 +157,8 @@ static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) { return new AMDGPUMCInstrAnalysis(Info); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo); TargetRegistry::RegisterMCInstrInfo(getTheR600Target(), diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 
30535ae88f7ba..586de433ea28a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -531,8 +531,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32}, Legal); - setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64}, - Custom); + setOperationAction( + {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}, + {MVT::f32, MVT::f64}, Custom); // These are really only legal for ieee_mode functions. We should be avoiding // them for functions that don't have ieee_mode enabled, so just say they are @@ -771,7 +772,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::v32f16, MVT::v32bf16}, Custom); - setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); + setOperationAction( + {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}, + MVT::f16, Custom); setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM, @@ -825,8 +828,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, VT, Custom); - setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16}, - Custom); + setOperationAction( + {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM}, + {MVT::v2f16, MVT::v4f16}, Custom); setOperationAction(ISD::FEXP, MVT::v2f16, Custom); setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16}, @@ -6062,6 +6066,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM: case ISD::FMAXNUM: return lowerFMINNUM_FMAXNUM(Op, DAG); + case ISD::FMINIMUMNUM: + case ISD::FMAXIMUMNUM: + return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG); case ISD::FMINIMUM: case ISD::FMAXIMUM: return lowerFMINIMUM_FMAXIMUM(Op, DAG); @@ -6086,8 +6093,6 @@ SDValue 
SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMUL: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: - case ISD::FMINIMUMNUM: - case ISD::FMAXIMUMNUM: case ISD::UADDSAT: case ISD::USUBSAT: case ISD::SADDSAT: @@ -6995,6 +7000,23 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, return Op; } +SDValue +SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); + bool IsIEEEMode = Info->getMode().IEEE; + + if (IsIEEEMode) + return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG); + + if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || + VT == MVT::v16bf16) + return splitBinaryVectorOp(Op, DAG); + return Op; +} + SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -13880,6 +13902,17 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, return Res; } + // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal + // for some types, but at a higher cost since it's implemented with a 3 + // operand form. + const SDNodeFlags Flags = N->getFlags(); + if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && + !Subtarget->hasIEEEMinMax() && Flags.hasNoNaNs()) { + unsigned NewOpc = + Opc == ISD::FMINIMUM ? 
ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE; + return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags); + } + return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index d71a22722129e..89fb12b52c3e6 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -147,6 +147,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue splitFP_ROUNDVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index e74ccbee975ab..768f57c469d64 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2648,11 +2648,10 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { // the asm operand name via this HasModifiers flag field string AsmDPP8 = getAsmDPP8.ret; field string AsmVOP3Base = getAsmVOP3Base.ret; + HasOpSel, HasOMod, IsVOP3P, HasNeg, HasSrc0FloatMods, HasSrc1FloatMods, + HasSrc2FloatMods, DstVT, HasFP8ByteSel, HasBitOp3>.ret; field string Asm64 = AsmVOP3Base; field string AsmVOP3P = getAsmVOP3P.ret; - field string AsmVOP3OpSel = AsmVOP3Base; field string AsmVOP3DPP = getAsmVOP3DPP.ret; field string AsmVOP3DPP16 = getAsmVOP3DPP16.ret; field string AsmVOP3DPP8 = getAsmVOP3DPP8.ret; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 1419f63202a7c..897c30948cf06 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1390,6 +1390,55 @@ def : GCNPat< 
(S_ADD_U64_PSEUDO $src0, $src1)>; } +//===----------------------------------------------------------------------===// +// FP min/max patterns +//===----------------------------------------------------------------------===// + + +class FPBinOpPat + : GCNPat <(vt (node (vt (VOP3Mods vt:$src0, i32:$src0_mods)), + (vt (VOP3Mods vt:$src1, i32:$src1_mods)))), + (inst $src0_mods, $src0, $src1_mods, $src1, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +class FPPkBinOpPat + : GCNPat <(vt (node (VOP3PMods v2f16:$src0, i32:$src0_mods), + (VOP3PMods v2f16:$src1, i32:$src1_mods))), + (inst $src0_mods, $src0, $src1_mods, $src1, DSTCLAMP.NONE) +>; + +/// With IEEE=0, signalingness is ignored and the non-nan input will +/// be directly returned. +let OtherPredicates = [IEEEModeDisabled] in { + def : FPBinOpPat; + def : FPBinOpPat; + def : FPBinOpPat; + def : FPBinOpPat; + + let SubtargetPredicate = Has16BitInsts, + True16Predicate = NotHasTrue16BitInsts in { + def : FPBinOpPat; + def : FPBinOpPat; + } + + let SubtargetPredicate = Has16BitInsts, + True16Predicate = UseRealTrue16Insts in { + def : FPBinOpPat; + def : FPBinOpPat; + } + + let SubtargetPredicate = Has16BitInsts, + True16Predicate = UseFakeTrue16Insts in { + def : FPBinOpPat; + def : FPBinOpPat; + } + + let SubtargetPredicate = HasVOP3PInsts in { + def : FPPkBinOpPat; + def : FPPkBinOpPat; + } +} + /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ diff --git a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp index 98fd16e59bf1f..ad547556cf150 100644 --- a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp +++ b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -12,6 +12,7 @@ #include "TargetInfo/AMDGPUTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace 
llvm; @@ -28,7 +29,8 @@ Target &llvm::getTheGCNTarget() { } /// Extern function to initialize the targets for the AMDGPU backend -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAMDGPUTargetInfo() { RegisterTarget R600(getTheR600Target(), "r600", "AMD GPUs HD2XXX-HD6XXX", "AMDGPU"); RegisterTarget GCN(getTheGCNTarget(), "amdgcn", diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 7fdd951ecbd3c..926df955881e0 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -393,7 +393,6 @@ def VOP_PERMLANE_SWAP : VOPProfile<[i32, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl); let InsVOP3OpSel = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl); let Asm64 = "$vdst, $src0$bound_ctrl$fi"; - let AsmVOP3OpSel = "$vdst, $src0$bound_ctrl$fi"; } // Special case because there are no true output operands. Hack vdst diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index a005e0245b8ff..2dbc119f65cda 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -806,7 +806,7 @@ def : GCNPat< (DivergentBinFrag i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; -let SubtargetPredicate = isGFX940Plus in +let SubtargetPredicate = HasLshlAddU64Inst in def : GCNPat< (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) @@ -1126,6 +1126,9 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile : VOP3_Profile

{ let HasModifiers = 0; let HasSrc0IntMods = 0; let HasSrc1IntMods = 0; + let HasSrc0FloatMods = 0; + let HasSrc1FloatMods = 0; + let HasSrc2FloatMods = 0; let HasOMod = 0; let HasOpSel = 0; let HasClamp = 0; @@ -1562,9 +1565,12 @@ let SubtargetPredicate = HasPseudoScalarTrans in { def : PseudoScalarPatF16; } +let HasModifiers = 1 in +def ASHR_PK_I8_Profile : VOP3_Profile; + let SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 in { - defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", VOP3_Profile, int_amdgcn_ashr_pk_i8_i32>; - defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", VOP3_Profile, int_amdgcn_ashr_pk_u8_i32>; + defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", ASHR_PK_I8_Profile, int_amdgcn_ashr_pk_i8_i32>; + defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", ASHR_PK_I8_Profile, int_amdgcn_ashr_pk_u8_i32>; } // End SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 class AshrPkI8Pat: GCNPat< diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index fa14370025515..1443747709b7a 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -40,6 +40,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ARMBuildAttributes.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -2456,7 +2457,8 @@ INITIALIZE_PASS(ARMAsmPrinter, "arm-asm-printer", "ARM Assembly Printer", false, //===----------------------------------------------------------------------===// // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeARMAsmPrinter() { RegisterAsmPrinter X(getTheARMLETarget()); RegisterAsmPrinter Y(getTheARMBETarget()); RegisterAsmPrinter A(getTheThumbLETarget()); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 5b3664c4e961f..05d8a1190ada8 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -522,67 +522,69 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Uses VFP for Thumb libfuncs if available. if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { + // clang-format off static const struct { const RTLIB::Libcall Op; const char * const Name; - const ISD::CondCode Cond; + const CmpInst::Predicate Cond; } LibraryCalls[] = { // Single-precision floating-point arithmetic. - { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, - { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, - { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, - { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::ADD_F32, "__addsf3vfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SUB_F32, "__subsf3vfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::MUL_F32, "__mulsf3vfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::DIV_F32, "__divsf3vfp", CmpInst::BAD_ICMP_PREDICATE }, // Double-precision floating-point arithmetic. 
- { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, - { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, - { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, - { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, + { RTLIB::ADD_F64, "__adddf3vfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SUB_F64, "__subdf3vfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::MUL_F64, "__muldf3vfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::DIV_F64, "__divdf3vfp", CmpInst::BAD_ICMP_PREDICATE }, // Single-precision comparisons. - { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, - { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, - { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, - { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, - { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, - { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, - { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, + { RTLIB::OEQ_F32, "__eqsf2vfp", CmpInst::ICMP_NE }, + { RTLIB::UNE_F32, "__nesf2vfp", CmpInst::ICMP_NE }, + { RTLIB::OLT_F32, "__ltsf2vfp", CmpInst::ICMP_NE }, + { RTLIB::OLE_F32, "__lesf2vfp", CmpInst::ICMP_NE }, + { RTLIB::OGE_F32, "__gesf2vfp", CmpInst::ICMP_NE }, + { RTLIB::OGT_F32, "__gtsf2vfp", CmpInst::ICMP_NE }, + { RTLIB::UO_F32, "__unordsf2vfp", CmpInst::ICMP_NE }, // Double-precision comparisons. 
- { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, - { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, - { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, - { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, - { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, - { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, - { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, + { RTLIB::OEQ_F64, "__eqdf2vfp", CmpInst::ICMP_NE }, + { RTLIB::UNE_F64, "__nedf2vfp", CmpInst::ICMP_NE }, + { RTLIB::OLT_F64, "__ltdf2vfp", CmpInst::ICMP_NE }, + { RTLIB::OLE_F64, "__ledf2vfp", CmpInst::ICMP_NE }, + { RTLIB::OGE_F64, "__gedf2vfp", CmpInst::ICMP_NE }, + { RTLIB::OGT_F64, "__gtdf2vfp", CmpInst::ICMP_NE }, + { RTLIB::UO_F64, "__unorddf2vfp", CmpInst::ICMP_NE }, // Floating-point to integer conversions. // i64 conversions are done via library routines even when generating VFP // instructions, so use the same ones. - { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, - { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, - { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, - { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", CmpInst::BAD_ICMP_PREDICATE }, // Conversions between floating types. - { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, - { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, + { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", CmpInst::BAD_ICMP_PREDICATE }, // Integer to floating-point conversions. // i64 conversions are done via library routines even when generating VFP // instructions, so use the same ones. 
// FIXME: There appears to be some naming inconsistency in ARM libgcc: // e.g., __floatunsidf vs. __floatunssidfvfp. - { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, - { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, - { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, - { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", CmpInst::BAD_ICMP_PREDICATE }, }; + // clang-format on for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); - if (LC.Cond != ISD::SETCC_INVALID) + if (LC.Cond != CmpInst::BAD_ICMP_PREDICATE) setCmpLibcallCC(LC.Op, LC.Cond); } } @@ -592,97 +594,99 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->isAAPCS_ABI() && (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { + // clang-format off static const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; - const ISD::CondCode Cond; + const CmpInst::Predicate Cond; } LibraryCalls[] = { // Double-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 2 - { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { 
RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, // Double-precision floating-point comparison helper functions // RTABI chapter 4.1.2, Table 3 - { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, - { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE }, + { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_EQ }, + { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE }, + { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE }, + { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE }, + { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE }, + { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, // Single-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 4 - { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE 
}, + { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, // Single-precision floating-point comparison helper functions // RTABI chapter 4.1.2, Table 5 - { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, - { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, - { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE }, + { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_EQ }, + { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE }, + { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE }, + { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE }, + { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE }, + { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, // Floating-point to integer conversions. 
// RTABI chapter 4.1.2, Table 6 - { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, // Conversions between floating types. 
// RTABI chapter 4.1.2, Table 7 - { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, // Integer to floating-point conversions. // RTABI chapter 4.1.2, Table 8 - { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { 
RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, // Long long helper functions // RTABI chapter 4.2, Table 9 - { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, // Integer division functions // RTABI chapter 4.3.1 - { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::SDIV_I64, "__aeabi_ldivmod", 
CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, + { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE }, }; + // clang-format on for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); - if (LC.Cond != ISD::SETCC_INVALID) + if (LC.Cond != CmpInst::BAD_ICMP_PREDICATE) setCmpLibcallCC(LC.Op, LC.Cond); } diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 563e69a65ab3b..fee77a44e5e80 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -41,6 +41,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" @@ -82,7 +83,7 @@ namespace llvm { void initializeARMExecutionDomainFixPass(PassRegistry&); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() { // Register the target. RegisterTargetMachine X(getTheARMLETarget()); RegisterTargetMachine A(getTheThumbLETarget()); diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index f3bdcd64805d8..25f0273013373 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -12722,7 +12722,7 @@ bool ARMAsmParser::parseDirectiveSEHCustom(SMLoc L) { } /// Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmParser() { RegisterMCAsmParser X(getTheARMLETarget()); RegisterMCAsmParser Y(getTheARMBETarget()); RegisterMCAsmParser A(getTheThumbLETarget()); diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index ef30b1aafb28b..5f930fb0c8071 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -1269,7 +1269,8 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, return MCDisassembler::Fail; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeARMDisassembler() { TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(), createARMDisassembler); TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(), diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index c756bff3b501a..2d22b27ceb131 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" @@ -770,7 +771,7 @@ bool ARM::isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI) { } // Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() { for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(), &getTheThumbLETarget(), &getTheThumbBETarget()}) { // Register the MC asm info. 
diff --git a/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp index 4d514f3ca4442..3e3670d4e0192 100644 --- a/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp +++ b/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp @@ -8,6 +8,8 @@ #include "TargetInfo/ARMTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" + using namespace llvm; Target &llvm::getTheARMLETarget() { @@ -27,7 +29,8 @@ Target &llvm::getTheThumbBETarget() { return TheThumbBETarget; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeARMTargetInfo() { RegisterTarget X(getTheARMLETarget(), "arm", "ARM", "ARM"); RegisterTarget Y(getTheARMBETarget(), "armeb", diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp index ed537f8cc7178..ad8aa5717fb42 100644 --- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp @@ -16,7 +16,7 @@ #include "AVRSubtarget.h" #include "AVRTargetMachine.h" #include "MCTargetDesc/AVRInstPrinter.h" -#include "MCTargetDesc/AVRMCExpr.h" +#include "MCTargetDesc/AVRMCAsmInfo.h" #include "TargetInfo/AVRTargetInfo.h" #include "llvm/CodeGen/AsmPrinter.h" @@ -33,6 +33,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -215,7 +216,7 @@ const MCExpr *AVRAsmPrinter::lowerConstant(const Constant *CV, bool IsProgMem = GV->getAddressSpace() == AVR::ProgramMemory; if (IsProgMem) { const MCExpr *Expr = MCSymbolRefExpr::create(getSymbol(GV), Ctx); - return AVRMCExpr::create(AVRMCExpr::VK_PM, Expr, false, Ctx); + return AVRMCExpr::create(AVR::S_PM, Expr, false, Ctx); } } @@ -335,6 +336,7 @@ char AVRAsmPrinter::ID = 0; INITIALIZE_PASS(AVRAsmPrinter, 
"avr-asm-printer", "AVR Assembly Printer", false, false) -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAVRAsmPrinter() { llvm::RegisterAsmPrinter X(getTheAVRTarget()); } diff --git a/llvm/lib/Target/AVR/AVRMCInstLower.cpp b/llvm/lib/Target/AVR/AVRMCInstLower.cpp index 47d9073f6eb84..f4bddfdac3461 100644 --- a/llvm/lib/Target/AVR/AVRMCInstLower.cpp +++ b/llvm/lib/Target/AVR/AVRMCInstLower.cpp @@ -13,7 +13,7 @@ #include "AVRMCInstLower.h" #include "AVRInstrInfo.h" -#include "MCTargetDesc/AVRMCExpr.h" +#include "MCTargetDesc/AVRMCAsmInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/IR/Mangler.h" @@ -42,19 +42,19 @@ AVRMCInstLower::lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, if (TF & AVRII::MO_LO) { if (IsFunction) { - Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVRMCExpr::VK_LO8_GS - : AVRMCExpr::VK_PM_LO8, + Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVR::S_LO8_GS + : AVR::S_PM_LO8, Expr, IsNegated, Ctx); } else { - Expr = AVRMCExpr::create(AVRMCExpr::VK_LO8, Expr, IsNegated, Ctx); + Expr = AVRMCExpr::create(AVR::S_LO8, Expr, IsNegated, Ctx); } } else if (TF & AVRII::MO_HI) { if (IsFunction) { - Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVRMCExpr::VK_HI8_GS - : AVRMCExpr::VK_PM_HI8, + Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? 
AVR::S_HI8_GS + : AVR::S_PM_HI8, Expr, IsNegated, Ctx); } else { - Expr = AVRMCExpr::create(AVRMCExpr::VK_HI8, Expr, IsNegated, Ctx); + Expr = AVRMCExpr::create(AVR::S_HI8, Expr, IsNegated, Ctx); } } else if (TF != 0) { llvm_unreachable("Unknown target flag on symbol operand"); diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp index 72544b0afd8d2..b75417a0896a5 100644 --- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp +++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "AVR.h" #include "AVRMachineFunctionInfo.h" @@ -87,7 +88,7 @@ void AVRPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() { // Register the target. RegisterTargetMachine X(getTheAVRTarget()); diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index cab5caffdcba8..012cf2c70e2e5 100644 --- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "AVRRegisterInfo.h" +#include "MCTargetDesc/AVRMCAsmInfo.h" #include "MCTargetDesc/AVRMCELFStreamer.h" -#include "MCTargetDesc/AVRMCExpr.h" #include "MCTargetDesc/AVRMCTargetDesc.h" #include "TargetInfo/AVRTargetInfo.h" @@ -24,6 +24,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" @@ -447,7 +448,7 @@ bool AVRAsmParser::tryParseExpression(OperandVector &Operands, int64_t offset) { bool AVRAsmParser::tryParseRelocExpression(OperandVector 
&Operands) { bool isNegated = false; - AVRMCExpr::Specifier ModifierKind = AVRMCExpr::VK_AVR_NONE; + AVR::Specifier ModifierKind = AVR::S_AVR_NONE; SMLoc S = Parser.getTok().getLoc(); @@ -473,14 +474,14 @@ bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) { StringRef ModifierName = Parser.getTok().getString(); ModifierKind = AVRMCExpr::parseSpecifier(ModifierName); - if (ModifierKind != AVRMCExpr::VK_AVR_NONE) { + if (ModifierKind != AVR::S_AVR_NONE) { Parser.Lex(); Parser.Lex(); // Eat modifier name and parenthesis if (Parser.getTok().getString() == GENERATE_STUBS && Parser.getTok().getKind() == AsmToken::Identifier) { std::string GSModName = ModifierName.str() + "_" + GENERATE_STUBS; ModifierKind = AVRMCExpr::parseSpecifier(GSModName); - if (ModifierKind != AVRMCExpr::VK_AVR_NONE) + if (ModifierKind != AVR::S_AVR_NONE) Parser.Lex(); // Eat gs modifier name } } else { @@ -698,15 +699,15 @@ ParseStatus AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) { Tokens[1].getKind() == AsmToken::Identifier) { MCSymbol *Symbol = getContext().getOrCreateSymbol(".text"); AVRStreamer.emitValueForModiferKind(Symbol, SizeInBytes, L, - AVRMCExpr::VK_AVR_NONE); + AVR::S_AVR_NONE); return ParseStatus::NoMatch; } if (Parser.getTok().getKind() == AsmToken::Identifier && Parser.getLexer().peekTok().getKind() == AsmToken::LParen) { StringRef ModifierName = Parser.getTok().getString(); - AVRMCExpr::Specifier Spec = AVRMCExpr::parseSpecifier(ModifierName); - if (Spec != AVRMCExpr::VK_AVR_NONE) { + AVR::Specifier Spec = AVRMCExpr::parseSpecifier(ModifierName); + if (Spec != AVR::S_AVR_NONE) { Parser.Lex(); Parser.Lex(); // Eat the modifier and parenthesis } else { @@ -731,7 +732,7 @@ ParseStatus AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) { return (parseMany(parseOne)); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmParser() { 
RegisterMCAsmParser X(getTheAVRTarget()); } diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp index 70428673fcd8d..c7a584868f4e6 100644 --- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp +++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp @@ -23,6 +23,8 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" + using namespace llvm; #define DEBUG_TYPE "avr-disassembler" @@ -50,7 +52,8 @@ static MCDisassembler *createAVRDisassembler(const Target &T, return new AVRDisassembler(STI, Ctx); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAVRDisassembler() { // Register the disassembler. TargetRegistry::RegisterMCDisassembler(getTheAVRTarget(), createAVRDisassembler); diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp index e79ba29e0cbec..619efb376c613 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/AVRFixupKinds.h" -#include "MCTargetDesc/AVRMCExpr.h" +#include "MCTargetDesc/AVRMCAsmInfo.h" #include "MCTargetDesc/AVRMCTargetDesc.h" #include "llvm/MC/MCAssembler.h" @@ -36,42 +36,42 @@ AVRELFObjectWriter::AVRELFObjectWriter(uint8_t OSABI) unsigned AVRELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, bool IsPCRel) const { - auto Modifier = AVRMCExpr::Specifier(Target.getSpecifier()); + auto Spec = Target.getSpecifier(); switch ((unsigned)Fixup.getKind()) { case FK_Data_1: - switch (Modifier) { + switch (Spec) { default: llvm_unreachable("Unsupported Modifier"); - case AVRMCExpr::VK_None: + case AVR::S_None: return ELF::R_AVR_8; - case 
AVRMCExpr::VK_DIFF8: + case AVR::S_DIFF8: return ELF::R_AVR_DIFF8; - case AVRMCExpr::VK_LO8: + case AVR::S_LO8: return ELF::R_AVR_8_LO8; - case AVRMCExpr::VK_HI8: + case AVR::S_HI8: return ELF::R_AVR_8_HI8; - case AVRMCExpr::VK_HH8: + case AVR::S_HH8: return ELF::R_AVR_8_HLO8; } case FK_Data_4: - switch (Modifier) { + switch (Spec) { default: llvm_unreachable("Unsupported Modifier"); - case AVRMCExpr::VK_None: + case AVR::S_None: return ELF::R_AVR_32; - case AVRMCExpr::VK_DIFF32: + case AVR::S_DIFF32: return ELF::R_AVR_DIFF32; } case FK_Data_2: - switch (Modifier) { + switch (Spec) { default: llvm_unreachable("Unsupported Modifier"); - case AVRMCExpr::VK_None: + case AVR::S_None: return ELF::R_AVR_16; - case AVRMCExpr::VK_AVR_NONE: - case AVRMCExpr::VK_PM: + case AVR::S_AVR_NONE: + case AVR::S_PM: return ELF::R_AVR_16_PM; - case AVRMCExpr::VK_DIFF16: + case AVR::S_DIFF16: return ELF::R_AVR_DIFF16; } case AVR::fixup_32: diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp index d37e39c51e159..cfd7dc5822627 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp @@ -11,7 +11,10 @@ //===----------------------------------------------------------------------===// #include "AVRMCAsmInfo.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" #include "llvm/TargetParser/Triple.h" using namespace llvm; @@ -26,3 +29,196 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) { UsesELFSectionDirectiveForBSS = true; SupportsDebugInformation = true; } + +namespace { +const struct ModifierEntry { + const char *const Spelling; + AVRMCExpr::Specifier specifier; +} ModifierNames[] = { + {"lo8", AVR::S_LO8}, {"hi8", AVR::S_HI8}, + {"hh8", AVR::S_HH8}, // synonym with hlo8 + {"hlo8", AVR::S_HH8}, {"hhi8", AVR::S_HHI8}, + + {"pm", AVR::S_PM}, {"pm_lo8", 
AVR::S_PM_LO8}, + {"pm_hi8", AVR::S_PM_HI8}, {"pm_hh8", AVR::S_PM_HH8}, + + {"lo8_gs", AVR::S_LO8_GS}, {"hi8_gs", AVR::S_HI8_GS}, + {"gs", AVR::S_GS}, +}; + +} // end of anonymous namespace + +AVRMCExpr::Specifier AVRMCExpr::parseSpecifier(StringRef Name) { + const auto &Modifier = + llvm::find_if(ModifierNames, [&Name](ModifierEntry const &Mod) { + return Mod.Spelling == Name; + }); + + if (Modifier != std::end(ModifierNames)) { + return Modifier->specifier; + } + return AVR::S_AVR_NONE; +} + +const char *AVRMCExpr::getName() const { + const auto &Modifier = + llvm::find_if(ModifierNames, [this](ModifierEntry const &Mod) { + return Mod.specifier == specifier; + }); + + if (Modifier != std::end(ModifierNames)) { + return Modifier->Spelling; + } + return nullptr; +} + +AVR::Fixups AVRMCExpr::getFixupKind() const { + AVR::Fixups Kind = AVR::Fixups::LastTargetFixupKind; + + switch (specifier) { + case AVR::S_LO8: + Kind = isNegated() ? AVR::fixup_lo8_ldi_neg : AVR::fixup_lo8_ldi; + break; + case AVR::S_HI8: + Kind = isNegated() ? AVR::fixup_hi8_ldi_neg : AVR::fixup_hi8_ldi; + break; + case AVR::S_HH8: + Kind = isNegated() ? AVR::fixup_hh8_ldi_neg : AVR::fixup_hh8_ldi; + break; + case AVR::S_HHI8: + Kind = isNegated() ? AVR::fixup_ms8_ldi_neg : AVR::fixup_ms8_ldi; + break; + + case AVR::S_PM_LO8: + Kind = isNegated() ? AVR::fixup_lo8_ldi_pm_neg : AVR::fixup_lo8_ldi_pm; + break; + case AVR::S_PM_HI8: + Kind = isNegated() ? AVR::fixup_hi8_ldi_pm_neg : AVR::fixup_hi8_ldi_pm; + break; + case AVR::S_PM_HH8: + Kind = isNegated() ? 
AVR::fixup_hh8_ldi_pm_neg : AVR::fixup_hh8_ldi_pm; + break; + case AVR::S_PM: + case AVR::S_GS: + Kind = AVR::fixup_16_pm; + break; + case AVR::S_LO8_GS: + Kind = AVR::fixup_lo8_ldi_gs; + break; + case AVR::S_HI8_GS: + Kind = AVR::fixup_hi8_ldi_gs; + break; + + default: + llvm_unreachable("Uninitialized expression"); + } + + return Kind; +} + +void AVRMCAsmInfo::printSpecifierExpr(raw_ostream &OS, + const MCSpecifierExpr &Expr) const { + auto &E = static_cast(Expr); + assert(E.getSpecifier() != AVR::S_AVR_NONE); + OS << E.getName() << '('; + if (E.isNegated()) + OS << '-' << '('; + printExpr(OS, *E.getSubExpr()); + if (E.isNegated()) + OS << ')'; + OS << ')'; +} + +int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const { + if (Negated) + Value *= -1; + + switch (specifier) { + case AVR::S_LO8: + Value &= 0xff; + break; + case AVR::S_HI8: + Value &= 0xff00; + Value >>= 8; + break; + case AVR::S_HH8: + Value &= 0xff0000; + Value >>= 16; + break; + case AVR::S_HHI8: + Value &= 0xff000000; + Value >>= 24; + break; + case AVR::S_PM_LO8: + case AVR::S_LO8_GS: + Value >>= 1; // Program memory addresses must always be shifted by one. + Value &= 0xff; + break; + case AVR::S_PM_HI8: + case AVR::S_HI8_GS: + Value >>= 1; // Program memory addresses must always be shifted by one. + Value &= 0xff00; + Value >>= 8; + break; + case AVR::S_PM_HH8: + Value >>= 1; // Program memory addresses must always be shifted by one. + Value &= 0xff0000; + Value >>= 16; + break; + case AVR::S_PM: + case AVR::S_GS: + Value >>= 1; // Program memory addresses must always be shifted by one. 
+ break; + + case AVR::S_AVR_NONE: + default: + llvm_unreachable("Uninitialized expression."); + } + return static_cast(Value) & 0xff; +} + +// bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result, +// const MCAssembler *Asm) const { +bool AVRMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, + MCValue &Result, + const MCAssembler *Asm) const { + auto &E = static_cast(Expr); + MCValue Value; + bool isRelocatable = E.getSubExpr()->evaluateAsRelocatable(Value, Asm); + if (!isRelocatable) + return false; + + if (Value.isAbsolute()) { + Result = MCValue::get(E.evaluateAsInt64(Value.getConstant())); + } else { + if (!Asm || !Asm->hasLayout()) + return false; + + auto Spec = AVR::S_None; + if (Value.getSpecifier() != MCSymbolRefExpr::VK_None) + return false; + assert(!Value.getSubSym()); + if (E.getSpecifier() == AVR::S_PM) + Spec = AVR::S_PM; + + // TODO: don't attach specifier to MCSymbolRefExpr. + Result = + MCValue::get(Value.getAddSym(), nullptr, Value.getConstant(), Spec); + } + + return true; +} + +bool AVRMCExpr::evaluateAsConstant(int64_t &Result) const { + MCValue Value; + bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, nullptr); + if (!isRelocatable) + return false; + + if (Value.isAbsolute()) { + Result = evaluateAsInt64(Value.getConstant()); + return true; + } + + return false; +} diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h index 17dd77f6266a1..fab271304e27d 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h @@ -13,7 +13,9 @@ #ifndef LLVM_AVR_ASM_INFO_H #define LLVM_AVR_ASM_INFO_H +#include "MCTargetDesc/AVRMCExpr.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" namespace llvm { @@ -23,8 +25,39 @@ class Triple; class AVRMCAsmInfo : public MCAsmInfo { public: explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options); + void printSpecifierExpr(raw_ostream &OS, + const 
MCSpecifierExpr &Expr) const override; + bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res, + const MCAssembler *Asm) const override; }; +namespace AVR { +using Specifier = uint16_t; +enum { + S_None, + + S_AVR_NONE = MCSymbolRefExpr::FirstTargetSpecifier, + + S_HI8, ///< Corresponds to `hi8()`. + S_LO8, ///< Corresponds to `lo8()`. + S_HH8, ///< Corresponds to `hlo8() and hh8()`. + S_HHI8, ///< Corresponds to `hhi8()`. + + S_PM, ///< Corresponds to `pm()`, reference to program memory. + S_PM_LO8, ///< Corresponds to `pm_lo8()`. + S_PM_HI8, ///< Corresponds to `pm_hi8()`. + S_PM_HH8, ///< Corresponds to `pm_hh8()`. + + S_LO8_GS, ///< Corresponds to `lo8(gs())`. + S_HI8_GS, ///< Corresponds to `hi8(gs())`. + S_GS, ///< Corresponds to `gs()`. + + S_DIFF8, + S_DIFF16, + S_DIFF32, +}; +} // namespace AVR + } // end namespace llvm #endif // LLVM_AVR_ASM_INFO_H diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp index fa01dad5ec128..4934e1c71bc03 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp @@ -12,7 +12,7 @@ #include "AVRMCCodeEmitter.h" -#include "MCTargetDesc/AVRMCExpr.h" +#include "MCTargetDesc/AVRMCAsmInfo.h" #include "MCTargetDesc/AVRMCTargetDesc.h" #include "llvm/ADT/APFloat.h" diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp index 88393fb9928a4..0644f422b328e 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp @@ -23,19 +23,19 @@ using namespace llvm; void AVRMCELFStreamer::emitValueForModiferKind( const MCSymbol *Sym, unsigned SizeInBytes, SMLoc Loc, AVRMCExpr::Specifier ModifierKind) { - AVRMCExpr::Specifier Kind = AVRMCExpr::VK_AVR_NONE; - if (ModifierKind == AVRMCExpr::VK_AVR_NONE) { - Kind = AVRMCExpr::VK_DIFF8; + AVRMCExpr::Specifier Kind 
= AVR::S_AVR_NONE; + if (ModifierKind == AVR::S_AVR_NONE) { + Kind = AVR::S_DIFF8; if (SizeInBytes == SIZE_LONG) - Kind = AVRMCExpr::VK_DIFF32; + Kind = AVR::S_DIFF32; else if (SizeInBytes == SIZE_WORD) - Kind = AVRMCExpr::VK_DIFF16; - } else if (ModifierKind == AVRMCExpr::VK_LO8) - Kind = AVRMCExpr::VK_LO8; - else if (ModifierKind == AVRMCExpr::VK_HI8) - Kind = AVRMCExpr::VK_HI8; - else if (ModifierKind == AVRMCExpr::VK_HH8) - Kind = AVRMCExpr::VK_HH8; + Kind = AVR::S_DIFF16; + } else if (ModifierKind == AVR::S_LO8) + Kind = AVR::S_LO8; + else if (ModifierKind == AVR::S_HI8) + Kind = AVR::S_HI8; + else if (ModifierKind == AVR::S_HH8) + Kind = AVR::S_HH8; MCELFStreamer::emitValue( MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VariantKind(Kind), getContext()), diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h index 2d45de083583c..88352337524ad 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIB_TARGET_AVR_MCTARGETDESC_AVRMCELFSTREAMER_H #define LLVM_LIB_TARGET_AVR_MCTARGETDESC_AVRMCELFSTREAMER_H -#include "MCTargetDesc/AVRMCExpr.h" +#include "MCTargetDesc/AVRMCAsmInfo.h" #include "MCTargetDesc/AVRMCTargetDesc.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" @@ -41,9 +41,10 @@ class AVRMCELFStreamer : public MCELFStreamer { std::move(Emitter)), MCII(createAVRMCInstrInfo()) {} - void emitValueForModiferKind( - const MCSymbol *Sym, unsigned SizeInBytes, SMLoc Loc = SMLoc(), - AVRMCExpr::Specifier ModifierKind = AVRMCExpr::VK_AVR_NONE); + void + emitValueForModiferKind(const MCSymbol *Sym, unsigned SizeInBytes, + SMLoc Loc = SMLoc(), + AVRMCExpr::Specifier ModifierKind = AVR::S_AVR_NONE); }; MCStreamer *createAVRELFStreamer(Triple const &TT, MCContext &Context, diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp index 
5be799093d2c1..5963976d0dc7f 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp @@ -7,208 +7,18 @@ //===----------------------------------------------------------------------===// #include "AVRMCExpr.h" +#include "MCTargetDesc/AVRMCAsmInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCValue.h" namespace llvm { -namespace { - -const struct ModifierEntry { - const char *const Spelling; - AVRMCExpr::Specifier specifier; -} ModifierNames[] = { - {"lo8", AVRMCExpr::VK_LO8}, {"hi8", AVRMCExpr::VK_HI8}, - {"hh8", AVRMCExpr::VK_HH8}, // synonym with hlo8 - {"hlo8", AVRMCExpr::VK_HH8}, {"hhi8", AVRMCExpr::VK_HHI8}, - - {"pm", AVRMCExpr::VK_PM}, {"pm_lo8", AVRMCExpr::VK_PM_LO8}, - {"pm_hi8", AVRMCExpr::VK_PM_HI8}, {"pm_hh8", AVRMCExpr::VK_PM_HH8}, - - {"lo8_gs", AVRMCExpr::VK_LO8_GS}, {"hi8_gs", AVRMCExpr::VK_HI8_GS}, - {"gs", AVRMCExpr::VK_GS}, -}; - -} // end of anonymous namespace - const AVRMCExpr *AVRMCExpr::create(Specifier Kind, const MCExpr *Expr, bool Negated, MCContext &Ctx) { return new (Ctx) AVRMCExpr(Kind, Expr, Negated); } -void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { - assert(specifier != VK_AVR_NONE); - OS << getName() << '('; - if (isNegated()) - OS << '-' << '('; - MAI->printExpr(OS, *getSubExpr()); - if (isNegated()) - OS << ')'; - OS << ')'; -} - -bool AVRMCExpr::evaluateAsConstant(int64_t &Result) const { - MCValue Value; - - bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, nullptr); - - if (!isRelocatable) - return false; - - if (Value.isAbsolute()) { - Result = evaluateAsInt64(Value.getConstant()); - return true; - } - - return false; -} - -bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result, - const MCAssembler *Asm) const { - MCValue Value; - bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, Asm); - if (!isRelocatable) - 
return false; - - if (Value.isAbsolute()) { - Result = MCValue::get(evaluateAsInt64(Value.getConstant())); - } else { - if (!Asm || !Asm->hasLayout()) - return false; - - auto Spec = AVRMCExpr::VK_None; - if (Value.getSpecifier() != MCSymbolRefExpr::VK_None) - return false; - assert(!Value.getSubSym()); - if (specifier == VK_PM) - Spec = AVRMCExpr::VK_PM; - - // TODO: don't attach specifier to MCSymbolRefExpr. - Result = - MCValue::get(Value.getAddSym(), nullptr, Value.getConstant(), Spec); - } - - return true; -} - -int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const { - if (Negated) - Value *= -1; - - switch (specifier) { - case AVRMCExpr::VK_LO8: - Value &= 0xff; - break; - case AVRMCExpr::VK_HI8: - Value &= 0xff00; - Value >>= 8; - break; - case AVRMCExpr::VK_HH8: - Value &= 0xff0000; - Value >>= 16; - break; - case AVRMCExpr::VK_HHI8: - Value &= 0xff000000; - Value >>= 24; - break; - case AVRMCExpr::VK_PM_LO8: - case AVRMCExpr::VK_LO8_GS: - Value >>= 1; // Program memory addresses must always be shifted by one. - Value &= 0xff; - break; - case AVRMCExpr::VK_PM_HI8: - case AVRMCExpr::VK_HI8_GS: - Value >>= 1; // Program memory addresses must always be shifted by one. - Value &= 0xff00; - Value >>= 8; - break; - case AVRMCExpr::VK_PM_HH8: - Value >>= 1; // Program memory addresses must always be shifted by one. - Value &= 0xff0000; - Value >>= 16; - break; - case AVRMCExpr::VK_PM: - case AVRMCExpr::VK_GS: - Value >>= 1; // Program memory addresses must always be shifted by one. - break; - - case AVRMCExpr::VK_AVR_NONE: - default: - llvm_unreachable("Uninitialized expression."); - } - return static_cast(Value) & 0xff; -} - -AVR::Fixups AVRMCExpr::getFixupKind() const { - AVR::Fixups Kind = AVR::Fixups::LastTargetFixupKind; - - switch (specifier) { - case VK_LO8: - Kind = isNegated() ? AVR::fixup_lo8_ldi_neg : AVR::fixup_lo8_ldi; - break; - case VK_HI8: - Kind = isNegated() ? 
AVR::fixup_hi8_ldi_neg : AVR::fixup_hi8_ldi; - break; - case VK_HH8: - Kind = isNegated() ? AVR::fixup_hh8_ldi_neg : AVR::fixup_hh8_ldi; - break; - case VK_HHI8: - Kind = isNegated() ? AVR::fixup_ms8_ldi_neg : AVR::fixup_ms8_ldi; - break; - - case VK_PM_LO8: - Kind = isNegated() ? AVR::fixup_lo8_ldi_pm_neg : AVR::fixup_lo8_ldi_pm; - break; - case VK_PM_HI8: - Kind = isNegated() ? AVR::fixup_hi8_ldi_pm_neg : AVR::fixup_hi8_ldi_pm; - break; - case VK_PM_HH8: - Kind = isNegated() ? AVR::fixup_hh8_ldi_pm_neg : AVR::fixup_hh8_ldi_pm; - break; - case VK_PM: - case VK_GS: - Kind = AVR::fixup_16_pm; - break; - case VK_LO8_GS: - Kind = AVR::fixup_lo8_ldi_gs; - break; - case VK_HI8_GS: - Kind = AVR::fixup_hi8_ldi_gs; - break; - - default: - llvm_unreachable("Uninitialized expression"); - } - - return Kind; -} - -const char *AVRMCExpr::getName() const { - const auto &Modifier = - llvm::find_if(ModifierNames, [this](ModifierEntry const &Mod) { - return Mod.specifier == specifier; - }); - - if (Modifier != std::end(ModifierNames)) { - return Modifier->Spelling; - } - return nullptr; -} - -AVRMCExpr::Specifier AVRMCExpr::parseSpecifier(StringRef Name) { - const auto &Modifier = - llvm::find_if(ModifierNames, [&Name](ModifierEntry const &Mod) { - return Mod.Spelling == Name; - }); - - if (Modifier != std::end(ModifierNames)) { - return Modifier->specifier; - } - return VK_AVR_NONE; -} - -} // end of namespace llvm +} // namespace llvm diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h index 69c60cde1f746..5592e24be5378 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h @@ -18,31 +18,9 @@ namespace llvm { /// A expression in AVR machine code. class AVRMCExpr : public MCSpecifierExpr { public: + friend class AVRMCAsmInfo; using Specifier = Spec; /// Specifies the type of an expression. 
- enum { - VK_None, - - VK_AVR_NONE = MCSymbolRefExpr::FirstTargetSpecifier, - - VK_HI8, ///< Corresponds to `hi8()`. - VK_LO8, ///< Corresponds to `lo8()`. - VK_HH8, ///< Corresponds to `hlo8() and hh8()`. - VK_HHI8, ///< Corresponds to `hhi8()`. - - VK_PM, ///< Corresponds to `pm()`, reference to program memory. - VK_PM_LO8, ///< Corresponds to `pm_lo8()`. - VK_PM_HI8, ///< Corresponds to `pm_hi8()`. - VK_PM_HH8, ///< Corresponds to `pm_hh8()`. - - VK_LO8_GS, ///< Corresponds to `lo8(gs())`. - VK_HI8_GS, ///< Corresponds to `hi8(gs())`. - VK_GS, ///< Corresponds to `gs()`. - - VK_DIFF8, - VK_DIFF16, - VK_DIFF32, - }; public: /// Creates an AVR machine code expression. @@ -59,10 +37,6 @@ class AVRMCExpr : public MCSpecifierExpr { bool isNegated() const { return Negated; } void setNegated(bool negated = true) { Negated = negated; } - void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; - bool evaluateAsRelocatableImpl(MCValue &Res, - const MCAssembler *Asm) const override; - public: static Specifier parseSpecifier(StringRef Name); diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp index f87fb70f97ff0..d29a7a56167c9 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #define GET_INSTRINFO_MC_DESC #define ENABLE_INSTR_PREDICATE_VERIFIER @@ -87,7 +88,7 @@ static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S, return new AVRTargetAsmStreamer(S); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetMC() { // Register the MC asm info. 
RegisterMCAsmInfo X(getTheAVRTarget()); diff --git a/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp index dd61add1526cf..d81db50650ba7 100644 --- a/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp +++ b/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/AVRTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" namespace llvm { Target &getTheAVRTarget() { static Target TheAVRTarget; @@ -15,7 +16,8 @@ Target &getTheAVRTarget() { } } // namespace llvm -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeAVRTargetInfo() { llvm::RegisterTarget X(llvm::getTheAVRTarget(), "avr", "Atmel AVR Microcontroller", "AVR"); } diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp index 139ac429dd135..b49e8fd96c66a 100644 --- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp +++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -261,7 +262,6 @@ struct BPFOperand : public MCParsedAsmOperand { .Case("bswap32", true) .Case("bswap64", true) .Case("goto", true) - .Case("gotol", true) .Case("ll", true) .Case("skb", true) .Case("s", true) @@ -533,7 +533,7 @@ bool BPFAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, return false; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFAsmParser() { RegisterMCAsmParser X(getTheBPFTarget()); RegisterMCAsmParser Y(getTheBPFleTarget()); RegisterMCAsmParser Z(getTheBPFbeTarget()); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 
5dd71cc91427a..e3843e0e112e2 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -155,7 +156,8 @@ INITIALIZE_PASS(BPFAsmPrinter, "bpf-asm-printer", "BPF Assembly Printer", false, false) // Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeBPFAsmPrinter() { RegisterAsmPrinter X(getTheBPFleTarget()); RegisterAsmPrinter Y(getTheBPFbeTarget()); RegisterAsmPrinter Z(getTheBPFTarget()); diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index 0c3f61fdfedd6..527a480354571 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -26,6 +26,7 @@ #include "llvm/InitializePasses.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" @@ -41,7 +42,7 @@ static cl::opt DisableCheckUnreachable("bpf-disable-trap-unreachable", cl::Hidden, cl::desc("Disable Trap Unreachable for BPF")); -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() { // Register the target. 
RegisterTargetMachine X(getTheBPFleTarget()); RegisterTargetMachine Y(getTheBPFbeTarget()); diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp index 536bee5393843..4dfae81e90191 100644 --- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp +++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" #include "llvm/TargetParser/SubtargetFeature.h" #include @@ -82,8 +83,8 @@ static MCDisassembler *createBPFDisassembler(const Target &T, return new BPFDisassembler(STI, Ctx); } - -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeBPFDisassembler() { // Register the disassembler. TargetRegistry::RegisterMCDisassembler(getTheBPFTarget(), createBPFDisassembler); diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp index caf84701b999f..5f44dd9583aff 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Host.h" #define GET_INSTRINFO_MC_DESC @@ -104,7 +105,7 @@ static MCInstrAnalysis *createBPFInstrAnalysis(const MCInstrInfo *Info) { return new BPFMCInstrAnalysis(Info); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetMC() { for (Target *T : {&getTheBPFleTarget(), &getTheBPFbeTarget(), &getTheBPFTarget()}) { // Register the MC asm info. 
@@ -153,5 +154,4 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetMC() { TargetRegistry::RegisterMCAsmBackend(getTheBPFTarget(), createBPFbeAsmBackend); } - } diff --git a/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp index d7cdcae916aaf..6ea6cd56a6d05 100644 --- a/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp +++ b/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/BPFTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -24,7 +25,8 @@ Target &llvm::getTheBPFTarget() { return TheBPFTarget; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeBPFTargetInfo() { TargetRegistry::RegisterTarget(getTheBPFTarget(), "bpf", "BPF (host endian)", "BPF", [](Triple::ArchType) { return false; }, true); diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp index 035899205bf8c..94b2dbe78c4f7 100644 --- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp +++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp @@ -24,7 +24,9 @@ static bool finalizeLinkage(Module &M) { for (Function &EF : M.functions()) { if (EF.isIntrinsic()) continue; - if (EF.hasFnAttribute("hlsl.shader") || EF.hasFnAttribute("hlsl.export")) + if (EF.hasExternalLinkage() && EF.hasDefaultVisibility()) + continue; + if (EF.hasFnAttribute("hlsl.shader")) continue; Funcs.push_back(&EF); } diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp index b1f3f41a28e8b..0b7cf2f970172 100644 --- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp +++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp @@ -86,6 +86,13 @@ class DXILFlattenArraysVisitor Value *genInstructionFlattenIndices(ArrayRef Indices, ArrayRef Dims, IRBuilder<> &Builder); + + // Helper function to collect 
indices and dimensions from a GEP instruction + void collectIndicesAndDimsFromGEP(GetElementPtrInst &GEP, + SmallVectorImpl &Indices, + SmallVectorImpl &Dims, + bool &AllIndicesAreConstInt); + void recursivelyCollectGEPs(GetElementPtrInst &CurrGEP, ArrayType *FlattenedArrayType, Value *PtrOperand, @@ -218,6 +225,26 @@ bool DXILFlattenArraysVisitor::visitAllocaInst(AllocaInst &AI) { return true; } +void DXILFlattenArraysVisitor::collectIndicesAndDimsFromGEP( + GetElementPtrInst &GEP, SmallVectorImpl &Indices, + SmallVectorImpl &Dims, bool &AllIndicesAreConstInt) { + + Type *CurrentType = GEP.getSourceElementType(); + + // Note index 0 is the ptr index. + for (Value *Index : llvm::drop_begin(GEP.indices(), 1)) { + Indices.push_back(Index); + AllIndicesAreConstInt &= isa(Index); + + if (auto *ArrayTy = dyn_cast(CurrentType)) { + Dims.push_back(ArrayTy->getNumElements()); + CurrentType = ArrayTy->getElementType(); + } else { + assert(false && "Expected array type in GEP chain"); + } + } +} + void DXILFlattenArraysVisitor::recursivelyCollectGEPs( GetElementPtrInst &CurrGEP, ArrayType *FlattenedArrayType, Value *PtrOperand, unsigned &GEPChainUseCount, SmallVector Indices, @@ -226,12 +253,8 @@ void DXILFlattenArraysVisitor::recursivelyCollectGEPs( if (GEPChainMap.count(&CurrGEP) > 0) return; - Value *LastIndex = CurrGEP.getOperand(CurrGEP.getNumOperands() - 1); - AllIndicesAreConstInt &= isa(LastIndex); - Indices.push_back(LastIndex); - assert(isa(CurrGEP.getSourceElementType())); - Dims.push_back( - cast(CurrGEP.getSourceElementType())->getNumElements()); + // Collect indices and dimensions from the current GEP + collectIndicesAndDimsFromGEP(CurrGEP, Indices, Dims, AllIndicesAreConstInt); bool IsMultiDimArr = isMultiDimensionalArray(CurrGEP.getSourceElementType()); if (!IsMultiDimArr) { assert(GEPChainUseCount < FlattenedArrayType->getNumElements()); @@ -316,9 +339,12 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Handle zero uses 
here because there won't be an update via // a child in the chain later. if (GEPChainUseCount == 0) { - SmallVector Indices({GEP.getOperand(GEP.getNumOperands() - 1)}); - SmallVector Dims({ArrType->getNumElements()}); - bool AllIndicesAreConstInt = isa(Indices[0]); + SmallVector Indices; + SmallVector Dims; + bool AllIndicesAreConstInt = true; + + // Collect indices and dimensions from the GEP + collectIndicesAndDimsFromGEP(GEP, Indices, Dims, AllIndicesAreConstInt); GEPData GEPInfo{std::move(FlattenedArrayType), PtrOperand, std::move(Indices), std::move(Dims), AllIndicesAreConstInt}; return visitGetElementPtrInstInGEPChainBase(GEPInfo, GEP); diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index e0068787f5e5a..cb58f4833631d 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -148,9 +148,49 @@ class DXILPrepareModule : public ModulePass { Type *Ty) { // Omit bitcasts if the incoming value matches the instruction type. 
auto It = PointerTypes.find(Operand); - if (It != PointerTypes.end()) - if (cast(It->second)->getElementType() == Ty) + if (It != PointerTypes.end()) { + auto *OpTy = cast(It->second)->getElementType(); + if (OpTy == Ty) return nullptr; + } + + Type *ValTy = Operand->getType(); + // Also omit the bitcast for matching global array types + if (auto *GlobalVar = dyn_cast(Operand)) + ValTy = GlobalVar->getValueType(); + + if (auto *AI = dyn_cast(Operand)) + ValTy = AI->getAllocatedType(); + + if (auto *ArrTy = dyn_cast(ValTy)) { + Type *ElTy = ArrTy->getElementType(); + if (ElTy == Ty) + return nullptr; + } + + // finally, drill down GEP instructions until we get the array + // that is being accessed, and compare element types + if (ConstantExpr *GEPInstr = dyn_cast(Operand)) { + while (GEPInstr->getOpcode() == Instruction::GetElementPtr) { + Value *OpArg = GEPInstr->getOperand(0); + if (ConstantExpr *NewGEPInstr = dyn_cast(OpArg)) { + GEPInstr = NewGEPInstr; + continue; + } + + if (auto *GlobalVar = dyn_cast(OpArg)) + ValTy = GlobalVar->getValueType(); + if (auto *AI = dyn_cast(Operand)) + ValTy = AI->getAllocatedType(); + if (auto *ArrTy = dyn_cast(ValTy)) { + Type *ElTy = ArrTy->getElementType(); + if (ElTy == Ty) + return nullptr; + } + break; + } + } + // Insert bitcasts where we are removing the instruction. 
Builder.SetInsertPoint(&Inst); // This code only gets hit in opaque-pointer mode, so the type of the diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index f9b4bc0d14fd9..c423dca90a4ab 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -41,6 +41,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" @@ -878,7 +879,8 @@ bool HexagonAsmParser::RegisterMatchesArch(MCRegister MatchNum) const { // extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmLexer(); /// Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeHexagonAsmParser() { RegisterMCAsmParser X(getTheHexagonTarget()); } diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 98b711f6b014b..5bd31707acb6f 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -164,7 +165,8 @@ static MCDisassembler *createHexagonDisassembler(const Target &T, return new HexagonDisassembler(STI, Ctx, T.createMCInstrInfo()); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeHexagonDisassembler() { 
TargetRegistry::RegisterMCDisassembler(getTheHexagonTarget(), createHexagonDisassembler); } diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index c7580d28618ab..f22852d1ef557 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -43,6 +43,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -858,6 +859,7 @@ char HexagonAsmPrinter::ID = 0; INITIALIZE_PASS(HexagonAsmPrinter, "hexagon-asm-printer", "Hexagon Assembly Printer", false, false) -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeHexagonAsmPrinter() { RegisterAsmPrinter X(getTheHexagonTarget()); } diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 9604f252dd3df..c2eb24b482d44 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -2318,7 +2318,7 @@ bool HexagonLoopIdiomRecognize::coverLoop(Loop *L, // instructions in it that are not involved in the original set Insts. 
for (auto *B : L->blocks()) { for (auto &In : *B) { - if (isa(In) || isa(In)) + if (isa(In)) continue; if (!Worklist.count(&In) && In.mayHaveSideEffects()) return false; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 19b7c6a315f56..66508fd767793 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -26,6 +26,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Transforms/Scalar.h" #include @@ -174,7 +175,8 @@ static Reloc::Model getEffectiveRelocModel(std::optional RM) { return RM.value_or(Reloc::Static); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeHexagonTarget() { // Register the target. RegisterTargetMachine X(getTheHexagonTarget()); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 91051cd4e2d51..980df819b2c26 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -34,6 +34,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/HexagonAttributes.h" #include "llvm/Support/raw_ostream.h" @@ -775,7 +776,8 @@ static MCInstrAnalysis *createHexagonMCInstrAnalysis(const MCInstrInfo *Info) { } // Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeHexagonTargetMC() { // Register the MC asm info. 
RegisterMCAsmInfoFn X(getTheHexagonTarget(), createHexagonMCAsmInfo); diff --git a/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp index ef9f9fd337fac..34a7b945ca516 100644 --- a/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp +++ b/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/HexagonTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; Target &llvm::getTheHexagonTarget() { @@ -15,7 +16,8 @@ Target &llvm::getTheHexagonTarget() { return TheHexagonTarget; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeHexagonTargetInfo() { RegisterTarget X( getTheHexagonTarget(), "hexagon", "Hexagon", "Hexagon"); } diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index 9cb7f71945d1d..6a74686a239d0 100644 --- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -25,6 +25,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" @@ -1223,6 +1224,7 @@ bool LanaiAsmParser::parseInstruction(ParseInstructionInfo & /*Info*/, #define GET_MATCHER_IMPLEMENTATION #include "LanaiGenAsmMatcher.inc" -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLanaiAsmParser() { RegisterMCAsmParser x(getTheLanaiTarget()); } diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp index 2720e1d9a6a64..5d87c3c4d72cf 100644 --- 
a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp +++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" @@ -35,7 +36,8 @@ static MCDisassembler *createLanaiDisassembler(const Target & /*T*/, return new LanaiDisassembler(STI, Ctx); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLanaiDisassembler() { // Register the disassembler TargetRegistry::RegisterMCDisassembler(getTheLanaiTarget(), createLanaiDisassembler); diff --git a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp index 1c4fc572243c5..24e4fc3f53e63 100644 --- a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp +++ b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "asm-printer" @@ -242,6 +243,7 @@ INITIALIZE_PASS(LanaiAsmPrinter, "lanai-asm-printer", "Lanai Assembly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLanaiAsmPrinter() { RegisterAsmPrinter X(getTheLanaiTarget()); } diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp index 7f94e778e7545..3d6ba9ecc55e2 100644 --- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -21,12 +21,13 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetOptions.h" #include using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTarget() { // Register the target. RegisterTargetMachine registered_target( getTheLanaiTarget()); diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp index 4a381c033b384..687386c6962be 100644 --- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp +++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp @@ -22,6 +22,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" #include @@ -126,7 +127,8 @@ static MCInstrAnalysis *createLanaiInstrAnalysis(const MCInstrInfo *Info) { return new LanaiMCInstrAnalysis(Info); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLanaiTargetMC() { // Register the MC asm info. 
RegisterMCAsmInfo X(getTheLanaiTarget()); diff --git a/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp index 5c63df670938f..f56591a45f8f8 100644 --- a/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp +++ b/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/LanaiTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -16,7 +17,8 @@ Target &llvm::getTheLanaiTarget() { return TheLanaiTarget; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLanaiTargetInfo() { RegisterTarget X(getTheLanaiTarget(), "lanai", "Lanai", "Lanai"); } diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index 7d58270089575..a8fed951b0cfa 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -1953,7 +1954,8 @@ ParseStatus LoongArchAsmParser::parseDirective(AsmToken DirectiveID) { return ParseStatus::NoMatch; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLoongArchAsmParser() { RegisterMCAsmParser X(getTheLoongArch32Target()); RegisterMCAsmParser Y(getTheLoongArch64Target()); } diff --git a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp index 761682423fffe..8c4668ec70c7e 100644 --- a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp +++ b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp @@ -19,6 +19,7 
@@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" using namespace llvm; @@ -45,7 +46,8 @@ static MCDisassembler *createLoongArchDisassembler(const Target &T, return new LoongArchDisassembler(STI, Ctx); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLoongArchDisassembler() { // Register the disassembler for each target. TargetRegistry::RegisterMCDisassembler(getTheLoongArch32Target(), createLoongArchDisassembler); diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp index 64ac7c03c0419..b757d123fa0ff 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -25,6 +25,7 @@ #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -309,7 +310,8 @@ INITIALIZE_PASS(LoongArchAsmPrinter, "loongarch-asm-printer", "LoongArch Assembly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLoongArchAsmPrinter() { RegisterAsmPrinter X(getTheLoongArch32Target()); RegisterAsmPrinter Y(getTheLoongArch64Target()); } diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index d63e5a2b50e84..c36db9c75dd3a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/Compiler.h" #include "llvm/Transforms/Scalar.h" #include @@ -29,7 +30,8 @@ using namespace llvm; #define DEBUG_TYPE "loongarch" -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLoongArchTarget() { // Register the target. RegisterTargetMachine X(getTheLoongArch32Target()); RegisterTargetMachine Y(getTheLoongArch64Target()); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index d7569ab0ea597..b1491b75ac5bc 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -429,6 +429,26 @@ bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, return true; } +bool LoongArchAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA, + const MCFragment &F) { + // If the section does not contain linker-relaxable fragments, PC-relative + // fixups can be resolved. + if (!F.getParent()->isLinkerRelaxable()) + return true; + + // Otherwise, check if the offset between the symbol and fragment is fully + // resolved, unaffected by linker-relaxable fragments (e.g. 
instructions or + // offset-affected MCAlignFragment). Complements the generic + // isSymbolRefDifferenceFullyResolvedImpl. + if (!PCRelTemp) + PCRelTemp = getContext().createTempSymbol(); + PCRelTemp->setFragment(const_cast(&F)); + MCValue Res; + MCExpr::evaluateSymbolicAdd(Asm, false, MCValue::get(SymA), + MCValue::get(nullptr, PCRelTemp), Res); + return !Res.getSubSym(); +} + bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, const MCValue &Target, uint64_t &FixedValue, bool IsResolved) { @@ -447,19 +467,24 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, if (!force) { const MCSection &SecA = SA.getSection(); const MCSection &SecB = SB.getSection(); - - // We need record relocation if SecA != SecB. Usually SecB is same as the - // section of Fixup, which will be record the relocation as PCRel. If SecB - // is not same as the section of Fixup, it will report error. Just return - // false and then this work can be finished by handleFixup. - if (&SecA != &SecB) + const MCSection &SecCur = *F.getParent(); + + // To handle the case of A - B which B is same section with the current, + // generate PCRel relocations is better than ADD/SUB relocation pair. + // We can resolve it as A - PC + PC - B. The A - PC will be resolved + // as a PCRel relocation, while PC - B will serve as the addend. + // If the linker relaxation is disabled, it can be done directly since + // PC - B is constant. Otherwise, we should evaluate whether PC - B + // is constant. If it can be resolved as PCRel, use Fallback which + // generates R_LARCH_{32,64}_PCREL relocation later. + if (&SecA != &SecB && &SecB == &SecCur && + isPCRelFixupResolved(Target.getSubSym(), F)) return Fallback(); - // In SecA == SecB case. If the linker relaxation is enabled, we need - // record the ADD, SUB relocations. Otherwise the FixedValue has already - // been calc- ulated out in evaluateFixup, return true and avoid record - // relocations. 
- if (!STI.hasFeature(LoongArch::FeatureRelax)) + // In SecA == SecB case. If the linker relaxation is disabled, the + // FixedValue has already been calculated out in evaluateFixup, + // return true and avoid record relocations. + if (&SecA == &SecB && !STI.hasFeature(LoongArch::FeatureRelax)) return true; } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index aeedafe2b44b4..56554c5c664eb 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -30,6 +30,10 @@ class LoongArchAsmBackend : public MCAsmBackend { bool Is64Bit; const MCTargetOptions &TargetOptions; DenseMap SecToAlignSym; + // Temporary symbol used to check whether a PC-relative fixup is resolved. + MCSymbol *PCRelTemp = nullptr; + + bool isPCRelFixupResolved(const MCSymbol *SymA, const MCFragment &F); public: LoongArchAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp index 3ec070e5cbdd3..35277ce094a7d 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp @@ -204,7 +204,8 @@ MCStreamer *createLoongArchELFStreamer(const Triple &T, MCContext &Context, } } // end namespace -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLoongArchTargetMC() { for (Target *T : {&getTheLoongArch32Target(), &getTheLoongArch64Target()}) { TargetRegistry::RegisterMCRegInfo(*T, createLoongArchMCRegisterInfo); TargetRegistry::RegisterMCInstrInfo(*T, createLoongArchMCInstrInfo); diff --git a/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp 
b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp index 1d6be4069b71e..a7a5c25de3233 100644 --- a/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp +++ b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/LoongArchTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; Target &llvm::getTheLoongArch32Target() { @@ -20,7 +21,8 @@ Target &llvm::getTheLoongArch64Target() { return TheLoongArch64Target; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeLoongArchTargetInfo() { RegisterTarget X( getTheLoongArch32Target(), "loongarch32", "32-bit LoongArch", "LoongArch"); diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp index c61b8adf89ab4..5a4121f7cafd7 100644 --- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp +++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "msp430-asm-parser" @@ -534,7 +535,8 @@ bool MSP430AsmParser::ParseLiteralValues(unsigned Size, SMLoc L) { return (parseMany(parseOne)); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430AsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeMSP430AsmParser() { RegisterMCAsmParser X(getTheMSP430Target()); } diff --git a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp index 519bba763204f..4c5b473982f77 100644 --- a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp +++ b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCInst.h" #include 
"llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" using namespace llvm; @@ -57,7 +58,8 @@ static MCDisassembler *createMSP430Disassembler(const Target &T, return new MSP430Disassembler(STI, Ctx); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Disassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeMSP430Disassembler() { TargetRegistry::RegisterMCDisassembler(getTheMSP430Target(), createMSP430Disassembler); } diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp index df182a5459ead..2cb515aef11e4 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -80,7 +81,8 @@ static MCInstPrinter *createMSP430MCInstPrinter(const Triple &T, return nullptr; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430TargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeMSP430TargetMC() { Target &T = getTheMSP430Target(); TargetRegistry::RegisterMCAsmInfo(T, createMSP430MCAsmInfo); diff --git a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp index 44e55b6a3c9b7..44eea8149c594 100644 --- a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -189,6 +190,7 @@ INITIALIZE_PASS(MSP430AsmPrinter, "msp430-asm-printer", "MSP430 Assembly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430AsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeMSP430AsmPrinter() { RegisterAsmPrinter X(getTheMSP430Target()); } diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp index 763a2db2baca7..e6024f4a62185 100644 --- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -18,10 +18,11 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Target() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Target() { // Register the target. RegisterTargetMachine X(getTheMSP430Target()); PassRegistry &PR = *PassRegistry::getPassRegistry(); diff --git a/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp index fc2b38f41c141..a6170b82e1f49 100644 --- a/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp +++ b/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/MSP430TargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; Target &llvm::getTheMSP430Target() { @@ -15,7 +16,8 @@ Target &llvm::getTheMSP430Target() { return TheMSP430Target; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430TargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeMSP430TargetInfo() { RegisterTarget X(getTheMSP430Target(), "msp430", "MSP430 [experimental]", "MSP430"); } diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 7ea7c58f1a512..b559a8b896e0f 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ 
b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2965,9 +2965,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, Res.getConstant() == 0 && !IsLocalSym) { if (UseXGOT) { const MCExpr *CallHiExpr = - MipsMCExpr::create(Mips::S_CALL_HI16, SymExpr, getContext()); + MCSpecifierExpr::create(SymExpr, Mips::S_CALL_HI16, getContext()); const MCExpr *CallLoExpr = - MipsMCExpr::create(Mips::S_CALL_LO16, SymExpr, getContext()); + MCSpecifierExpr::create(SymExpr, Mips::S_CALL_LO16, getContext()); TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(CallHiExpr), IDLoc, STI); TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, DstReg, GPReg, @@ -2976,7 +2976,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, MCOperand::createExpr(CallLoExpr), IDLoc, STI); } else { const MCExpr *CallExpr = - MipsMCExpr::create(Mips::S_GOT_CALL, SymExpr, getContext()); + MCSpecifierExpr::create(SymExpr, Mips::S_GOT_CALL, getContext()); TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, GPReg, MCOperand::createExpr(CallExpr), IDLoc, STI); } @@ -3009,9 +3009,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, // this happens then the last instruction must use $rd as the result // register. const MCExpr *CallHiExpr = - MipsMCExpr::create(Mips::S_GOT_HI16, SymExpr, getContext()); - const MCExpr *CallLoExpr = - MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT_LO16, getContext()); + MCSpecifierExpr::create(SymExpr, Mips::S_GOT_HI16, getContext()); + const MCExpr *CallLoExpr = MCSpecifierExpr::create( + Res.getAddSym(), Mips::S_GOT_LO16, getContext()); TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(CallHiExpr), IDLoc, STI); @@ -3042,8 +3042,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, // The daddiu's marked with a '>' may be omitted if they are redundant. If // this happens then the last instruction must use $rd as the result // register. 
- GotExpr = - MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT_DISP, getContext()); + GotExpr = MCSpecifierExpr::create(Res.getAddSym(), Mips::S_GOT_DISP, + getContext()); if (Res.getConstant() != 0) { // Symbols fully resolve with just the %got_disp(symbol) but we // must still account for any offset to the symbol for @@ -3070,14 +3070,14 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, // this happens then the last instruction must use $rd as the result // register. if (IsLocalSym) { - GotExpr = MipsMCExpr::create(Mips::S_GOT, SymExpr, getContext()); - LoExpr = MipsMCExpr::create(Mips::S_LO, SymExpr, getContext()); + GotExpr = MCSpecifierExpr::create(SymExpr, Mips::S_GOT, getContext()); + LoExpr = MCSpecifierExpr::create(SymExpr, Mips::S_LO, getContext()); } else { // External symbols fully resolve the symbol with just the %got(symbol) // but we must still account for any offset to the symbol for // expressions like symbol+8. GotExpr = - MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT, getContext()); + MCSpecifierExpr::create(Res.getAddSym(), Mips::S_GOT, getContext()); if (Res.getConstant() != 0) LoExpr = MCConstantExpr::create(Res.getConstant(), getContext()); } @@ -3097,8 +3097,10 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, return false; } - const auto *HiExpr = MipsMCExpr::create(Mips::S_HI, SymExpr, getContext()); - const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, SymExpr, getContext()); + const auto *HiExpr = + MCSpecifierExpr::create(SymExpr, Mips::S_HI, getContext()); + const auto *LoExpr = + MCSpecifierExpr::create(SymExpr, Mips::S_LO, getContext()); // This is the 64-bit symbol address expansion. if (ABI.ArePtrs64bit() && isGP64bit()) { @@ -3110,9 +3112,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, // source register. 
const auto *HighestExpr = - MipsMCExpr::create(Mips::S_HIGHEST, SymExpr, getContext()); + MCSpecifierExpr::create(SymExpr, Mips::S_HIGHEST, getContext()); const auto *HigherExpr = - MipsMCExpr::create(Mips::S_HIGHER, SymExpr, getContext()); + MCSpecifierExpr::create(SymExpr, Mips::S_HIGHER, getContext()); bool RdRegIsRsReg = UseSrcReg && @@ -3310,7 +3312,8 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, if(IsPicEnabled) { const MCExpr *GotSym = MCSymbolRefExpr::create(Sym, getContext()); - const auto *GotExpr = MipsMCExpr::create(Mips::S_GOT, GotSym, getContext()); + const auto *GotExpr = + MCSpecifierExpr::create(GotSym, Mips::S_GOT, getContext()); if(isABI_O32() || isABI_N32()) { TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr), @@ -3321,7 +3324,8 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, } } else { //!IsPicEnabled const MCExpr *HiSym = MCSymbolRefExpr::create(Sym, getContext()); - const auto *HiExpr = MipsMCExpr::create(Mips::S_HI, HiSym, getContext()); + const auto *HiExpr = + MCSpecifierExpr::create(HiSym, Mips::S_HI, getContext()); // FIXME: This is technically correct but gives a different result to gas, // but gas is incomplete there (it has a fixme noting it doesn't work with @@ -3334,10 +3338,10 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, } else { //isABI_N64() const MCExpr *HighestSym = MCSymbolRefExpr::create(Sym, getContext()); const auto *HighestExpr = - MipsMCExpr::create(Mips::S_HIGHEST, HighestSym, getContext()); + MCSpecifierExpr::create(HighestSym, Mips::S_HIGHEST, getContext()); const MCExpr *HigherSym = MCSymbolRefExpr::create(Sym, getContext()); const auto *HigherExpr = - MipsMCExpr::create(Mips::S_HIGHER, HigherSym, getContext()); + MCSpecifierExpr::create(HigherSym, Mips::S_HIGHER, getContext()); TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc, STI); @@ -3424,7 +3428,7 @@ bool 
MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc, MCSymbol *Sym = getContext().createTempSymbol(); const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext()); - const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext()); + const auto *LoExpr = MCSpecifierExpr::create(LoSym, Mips::S_LO, getContext()); getStreamer().switchSection(ReadOnlySection); getStreamer().emitLabel(Sym, IDLoc); @@ -3474,7 +3478,7 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc, MCSymbol *Sym = getContext().createTempSymbol(); const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext()); - const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext()); + const auto *LoExpr = MCSpecifierExpr::create(LoSym, Mips::S_LO, getContext()); getStreamer().switchSection(ReadOnlySection); getStreamer().emitLabel(Sym, IDLoc); @@ -3554,7 +3558,7 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU, MCSymbol *Sym = getContext().createTempSymbol(); const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext()); - const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext()); + const auto *LoExpr = MCSpecifierExpr::create(LoSym, Mips::S_LO, getContext()); getStreamer().switchSection(ReadOnlySection); getStreamer().emitLabel(Sym, IDLoc); @@ -3777,15 +3781,15 @@ void MipsAsmParser::expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, // sw $8, %lo(sym)($at) const MCExpr *OffExpr = OffsetOp.getExpr(); MCOperand LoOperand = MCOperand::createExpr( - MipsMCExpr::create(Mips::S_LO, OffExpr, getContext())); + MCSpecifierExpr::create(OffExpr, Mips::S_LO, getContext())); MCOperand HiOperand = MCOperand::createExpr( - MipsMCExpr::create(Mips::S_HI, OffExpr, getContext())); + MCSpecifierExpr::create(OffExpr, Mips::S_HI, getContext())); if (ABI.IsN64()) { MCOperand HighestOperand = MCOperand::createExpr( - MipsMCExpr::create(Mips::S_HIGHEST, OffExpr, getContext())); + MCSpecifierExpr::create(OffExpr, 
Mips::S_HIGHEST, getContext())); MCOperand HigherOperand = MCOperand::createExpr( - MipsMCExpr::create(Mips::S_HIGHER, OffExpr, getContext())); + MCSpecifierExpr::create(OffExpr, Mips::S_HIGHER, getContext())); TOut.emitRX(Mips::LUi, TmpReg, HighestOperand, IDLoc, STI); TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, HigherOperand, IDLoc, STI); @@ -6394,7 +6398,7 @@ const MCExpr *MipsAsmParser::parseRelocExpr() { while (Ops.size()) { if (Parser.parseToken(AsmToken::RParen, "expected ')'")) return nullptr; - Res = MipsMCExpr::create(Ops.pop_back_val(), Res, getContext()); + Res = MCSpecifierExpr::create(Res, Ops.pop_back_val(), getContext()); } return Res; } @@ -8902,7 +8906,8 @@ bool MipsAsmParser::parseInternalDirectiveReallowModule() { return false; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeMipsAsmParser() { RegisterMCAsmParser X(getTheMipsTarget()); RegisterMCAsmParser Y(getTheMipselTarget()); RegisterMCAsmParser A(getTheMips64Target()); diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index 2a3a8eac2e9af..b3f6cd1609fbb 100644 --- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -503,7 +503,8 @@ static MCDisassembler *createMipselDisassembler( return new MipsDisassembler(STI, Ctx, false); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeMipsDisassembler() { // Register the disassembler. 
TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(), createMipsDisassembler); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt index d3f16e5042c3a..8b73a7bdd4bc1 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt +++ b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt @@ -7,7 +7,6 @@ add_llvm_component_library(LLVMMipsDesc MipsInstPrinter.cpp MipsMCAsmInfo.cpp MipsMCCodeEmitter.cpp - MipsMCExpr.cpp MipsMCTargetDesc.cpp MipsNaClELFStreamer.cpp MipsOptionRecord.cpp diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 58aa374e5302d..25e31941bbb45 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index b64f86f382974..0941d93fe0eb6 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -12,6 +12,7 @@ #include "MipsMCAsmInfo.h" #include "MipsABIInfo.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/Casting.h" #include "llvm/TargetParser/Triple.h" @@ -59,6 +60,13 @@ MipsCOFFMCAsmInfo::MipsCOFFMCAsmInfo() { AllowAtInName = true; } +const MCSpecifierExpr *Mips::createGpOff(const MCExpr *Expr, Mips::Specifier S, + MCContext &Ctx) { + Expr = MCSpecifierExpr::create(Expr, Mips::S_GPREL, Ctx); + Expr = MCSpecifierExpr::create(Expr, Mips::S_NEG, Ctx); + return MCSpecifierExpr::create(Expr, S, Ctx); +} + static void printImpl(const MCAsmInfo &MAI, raw_ostream &OS, const MCSpecifierExpr &Expr) { 
int64_t AbsVal; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h index 0975116328fc1..6ba90a5c20257 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h @@ -13,7 +13,6 @@ #ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H -#include "MCTargetDesc/MipsMCExpr.h" #include "llvm/MC/MCAsmInfoCOFF.h" #include "llvm/MC/MCAsmInfoELF.h" #include "llvm/MC/MCFixup.h" @@ -77,6 +76,8 @@ enum { }; bool isGpOff(const MCSpecifierExpr &E); +const MCSpecifierExpr *createGpOff(const MCExpr *Expr, Specifier S, + MCContext &Ctx); } } // namespace llvm diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index d2981c4ad4d20..35d4e0db35c31 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -581,7 +581,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { MCExpr::ExprKind Kind = Expr->getKind(); if (Kind == MCExpr::Specifier) { - const MipsMCExpr *MipsExpr = cast(Expr); + const auto *MipsExpr = cast(Expr); Mips::Fixups FixupKind = Mips::Fixups(0); switch (MipsExpr->getSpecifier()) { diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp deleted file mode 100644 index 821f662f0cbfb..0000000000000 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp +++ /dev/null @@ -1,39 +0,0 @@ -//===-- MipsMCExpr.cpp - Mips specific MC expression classes --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "MipsMCExpr.h" -#include "MCTargetDesc/MipsMCAsmInfo.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "mipsmcexpr" - -const MipsMCExpr *MipsMCExpr::create(MipsMCExpr::Specifier S, - const MCExpr *Expr, MCContext &Ctx) { - return new (Ctx) MipsMCExpr(Expr, S); -} - -const MipsMCExpr *MipsMCExpr::create(const MCSymbol *Sym, Specifier S, - MCContext &Ctx) { - return new (Ctx) MipsMCExpr(MCSymbolRefExpr::create(Sym, Ctx), S); -} - -const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::Specifier S, - const MCExpr *Expr, MCContext &Ctx) { - return create(S, create(Mips::S_NEG, create(Mips::S_GPREL, Expr, Ctx), Ctx), - Ctx); -} diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h deleted file mode 100644 index b78aeabb57992..0000000000000 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h +++ /dev/null @@ -1,36 +0,0 @@ -//===- MipsMCExpr.h - Mips specific MC expression classes -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H -#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H - -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCValue.h" - -namespace llvm { - -class MipsMCExpr : public MCSpecifierExpr { -public: - using Specifier = Spec; - -private: - explicit MipsMCExpr(const MCExpr *Expr, Specifier S) - : MCSpecifierExpr(Expr, S) {} - -public: - static const MipsMCExpr *create(Specifier S, const MCExpr *Expr, - MCContext &Ctx); - static const MipsMCExpr *create(const MCSymbol *Sym, Specifier S, - MCContext &Ctx); - static const MipsMCExpr *createGpOff(Specifier S, const MCExpr *Expr, - MCContext &Ctx); -}; - -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index add36d87b9eff..29f61ed9b2b83 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -29,6 +29,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/TargetParser/Triple.h" @@ -259,7 +260,7 @@ static MCInstrAnalysis *createMipsMCInstrAnalysis(const MCInstrInfo *Info) { return new MipsMCInstrAnalysis(Info); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTargetMC() { for (Target *T : {&getTheMipsTarget(), &getTheMipselTarget(), &getTheMips64Target(), &getTheMips64elTarget()}) { // Register the MC asm info. 
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 80a854c799014..6097ad8017846 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -16,7 +16,6 @@ #include "MipsBaseInfo.h" #include "MipsELFStreamer.h" #include "MipsInstPrinter.h" -#include "MipsMCExpr.h" #include "MipsMCTargetDesc.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmInfo.h" @@ -1266,9 +1265,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { MCInst TmpInst; TmpInst.setOpcode(Mips::LUi); TmpInst.addOperand(MCOperand::createReg(GPReg)); - const MCExpr *HiSym = MipsMCExpr::create( - Mips::S_HI, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()), - MCA.getContext()); + auto *HiSym = MCSpecifierExpr::create(GP_Disp, Mips::S_HI, MCA.getContext()); TmpInst.addOperand(MCOperand::createExpr(HiSym)); getStreamer().emitInstruction(TmpInst, STI); @@ -1277,9 +1274,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { TmpInst.setOpcode(Mips::ADDiu); TmpInst.addOperand(MCOperand::createReg(GPReg)); TmpInst.addOperand(MCOperand::createReg(GPReg)); - const MCExpr *LoSym = MipsMCExpr::create( - Mips::S_LO, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()), - MCA.getContext()); + auto *LoSym = MCSpecifierExpr::create(GP_Disp, Mips::S_LO, MCA.getContext()); TmpInst.addOperand(MCOperand::createExpr(LoSym)); getStreamer().emitInstruction(TmpInst, STI); @@ -1342,12 +1337,12 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI); } - const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff( - Mips::S_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()), - MCA.getContext()); - const MipsMCExpr *LoExpr = MipsMCExpr::createGpOff( - Mips::S_LO, MCSymbolRefExpr::create(&Sym, MCA.getContext()), - MCA.getContext()); + auto *HiExpr = + 
Mips::createGpOff(MCSymbolRefExpr::create(&Sym, MCA.getContext()), + Mips::S_HI, MCA.getContext()); + auto *LoExpr = + Mips::createGpOff(MCSymbolRefExpr::create(&Sym, MCA.getContext()), + Mips::S_LO, MCA.getContext()); // lui $gp, %hi(%neg(%gp_rel(funcSym))) emitRX(Mips::LUi, GPReg, MCOperand::createExpr(HiExpr), SMLoc(), &STI); diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index da3f7cb55b301..87e06a6d3c08a 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -55,6 +55,7 @@ #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -1244,7 +1245,7 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI, // Emit .dtprelword or .dtpreldword directive // and value for debug thread local expression. void MipsAsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const { - if (auto *MipsExpr = dyn_cast(Value)) { + if (auto *MipsExpr = dyn_cast(Value)) { if (MipsExpr && MipsExpr->getSpecifier() == Mips::S_DTPREL) { switch (Size) { case 4: @@ -1299,7 +1300,8 @@ INITIALIZE_PASS(MipsAsmPrinter, "mips-asm-printer", "Mips Assembly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeMipsAsmPrinter() { RegisterAsmPrinter X(getTheMipsTarget()); RegisterAsmPrinter Y(getTheMipselTarget()); RegisterAsmPrinter A(getTheMips64Target()); diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/llvm/lib/Target/Mips/MipsMCInstLower.cpp index 935fcd8fa7154..cdf58384427f2 100644 --- a/llvm/lib/Target/Mips/MipsMCInstLower.cpp +++ b/llvm/lib/Target/Mips/MipsMCInstLower.cpp @@ -175,9 +175,9 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, } if (IsGpOff) - Expr = MipsMCExpr::createGpOff(TargetKind, Expr, *Ctx); + Expr = Mips::createGpOff(Expr, TargetKind, *Ctx); else if (TargetKind != Mips::S_None) - Expr = MipsMCExpr::create(TargetKind, Expr, *Ctx); + Expr = MCSpecifierExpr::create(Expr, TargetKind, *Ctx); return MCOperand::createExpr(Expr); } @@ -216,7 +216,7 @@ MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1, const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::create(BB2->getSymbol(), *Ctx); const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Sym1, Sym2, *Ctx); - return MCOperand::createExpr(MipsMCExpr::create(Kind, Sub, *Ctx)); + return MCOperand::createExpr(MCSpecifierExpr::create(Sub, Kind, *Ctx)); } void MipsMCInstLower:: @@ -248,7 +248,7 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const { if (MI->getNumOperands() == 2) { const MCExpr *Expr = MCSymbolRefExpr::create(MI->getOperand(1).getMBB()->getSymbol(), *Ctx); - const auto *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx); + const auto *MipsExpr = MCSpecifierExpr::create(Expr, Spec, *Ctx); OutMI.addOperand(MCOperand::createExpr(MipsExpr)); } else if (MI->getNumOperands() == 3) { // Create %hi($tgt-$baltgt). @@ -290,7 +290,7 @@ void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI, // Lower register operand. 
const MCExpr *Expr = MCSymbolRefExpr::create(MI->getOperand(2).getMBB()->getSymbol(), *Ctx); - const auto *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx); + const auto *MipsExpr = MCSpecifierExpr::create(Expr, Spec, *Ctx); OutMI.addOperand(MCOperand::createExpr(MipsExpr)); } else if (MI->getNumOperands() == 4) { // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt). diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp index 30b4d506c5caa..8c519fa379dd8 100644 --- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -37,6 +37,7 @@ #include "llvm/InitializePasses.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" @@ -51,7 +52,7 @@ static cl::opt EnableMulMulFix("mfix4300", cl::init(false), cl::desc("Enable the VR4300 mulmul bug fix."), cl::Hidden); -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() { // Register the target. 
RegisterTargetMachine X(getTheMipsTarget()); RegisterTargetMachine Y(getTheMipselTarget()); diff --git a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp index 23aa699318a2e..78a9f3b7cc71b 100644 --- a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -189,5 +189,5 @@ MipsTargetObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const { const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext()); Expr = MCBinaryExpr::createAdd( Expr, MCConstantExpr::create(0x8000, getContext()), getContext()); - return MipsMCExpr::create(Mips::S_DTPREL, Expr, getContext()); + return MCSpecifierExpr::create(Expr, Mips::S_DTPREL, getContext()); } diff --git a/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp index db5f607bbb4f5..458032042e15f 100644 --- a/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp +++ b/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/MipsTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; Target &llvm::getTheMipsTarget() { @@ -27,7 +28,8 @@ Target &llvm::getTheMips64elTarget() { return TheMips64elTarget; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeMipsTargetInfo() { RegisterTarget X(getTheMipsTarget(), "mips", "MIPS (32-bit big endian)", "Mips"); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp index 1cafd236a2925..cb7132b5f3042 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using 
namespace llvm; @@ -71,7 +72,8 @@ static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) { } // Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeNVPTXTargetMC() { for (Target *T : {&getTheNVPTXTarget32(), &getTheNVPTXTarget64()}) { // Register the MC asm info. RegisterMCAsmInfo X(*T); diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index b4e2c46b94440..9af6fb2cb198e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -77,6 +77,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/NativeFormatting.h" @@ -1947,7 +1948,8 @@ INITIALIZE_PASS(NVPTXAsmPrinter, "nvptx-asm-printer", "NVPTX Assembly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeNVPTXAsmPrinter() { RegisterAsmPrinter X(getTheNVPTXTarget32()); RegisterAsmPrinter Y(getTheNVPTXTarget64()); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 79b1bfbc8072b..ff10eea371049 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -136,7 +136,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { break; case NVPTXISD::LDUV2: case NVPTXISD::LDUV4: - if (tryLDGLDU(N)) + if (tryLDU(N)) return; break; case NVPTXISD::StoreV2: @@ -324,7 +324,7 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_p: - return tryLDGLDU(N); + return tryLDU(N); case Intrinsic::nvvm_tcgen05_ld_16x64b_x1: case Intrinsic::nvvm_tcgen05_ld_16x64b_x2: @@ -1048,35 +1048,28 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { assert(LD->readMem() && "Expected load"); // do not support pre/post inc/dec - LoadSDNode *PlainLoad = dyn_cast(N); + const LoadSDNode *PlainLoad = dyn_cast(LD); if (PlainLoad && PlainLoad->isIndexed()) return false; - EVT LoadedVT = LD->getMemoryVT(); - if (!LoadedVT.isSimple()) + const EVT LoadedEVT = LD->getMemoryVT(); + if (!LoadedEVT.isSimple()) return false; + const MVT LoadedVT = LoadedEVT.getSimpleVT(); // Address Space Setting const unsigned CodeAddrSpace = getCodeAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) - return tryLDGLDU(N); + return tryLDG(LD); - SDLoc DL(N); + SDLoc DL(LD); SDValue Chain = N->getOperand(0); - auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD); + const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD); - // Type Setting: fromType + fromTypeWidth - // - // Sign : ISD::SEXTLOAD - // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or 
ISD::EXTLOAD and the - // type is integer - // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float - MVT SimpleVT = LoadedVT.getSimpleVT(); - // Read at least 8 bits (predicates are stored as 8-bit values) - unsigned FromTypeWidth = std::max(8U, (unsigned)SimpleVT.getSizeInBits()); + const unsigned FromTypeWidth = LoadedVT.getSizeInBits(); // Vector Setting - unsigned int FromType = + const unsigned FromType = (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD)) ? NVPTX::PTXLdStInstCode::Signed : NVPTX::PTXLdStInstCode::Untyped; @@ -1102,29 +1095,17 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { if (!Opcode) return false; - SDNode *NVPTXLD = - CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops); + SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops); if (!NVPTXLD) return false; - MachineMemOperand *MemRef = cast(N)->getMemOperand(); + MachineMemOperand *MemRef = LD->getMemOperand(); CurDAG->setNodeMemRefs(cast(NVPTXLD), {MemRef}); - ReplaceNode(N, NVPTXLD); + ReplaceNode(LD, NVPTXLD); return true; } -static bool isSubVectorPackedInI32(EVT EltVT) { - // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for - // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use - // vectorized loads/stores with the actual element type for i8/i16 as that - // would require v8/v16 variants that do not exist. - // In order to load/store such vectors efficiently, in Type Legalization - // we split the vector into word-sized chunks (v2x16/v4i8). Now, we will - // lower to PTX as vectors of b32. 
- return Isv2x16VT(EltVT) || EltVT == MVT::v4i8; -} - static unsigned getLoadStoreVectorNumElts(SDNode *N) { switch (N->getOpcode()) { case NVPTXISD::LoadV2: @@ -1142,21 +1123,21 @@ static unsigned getLoadStoreVectorNumElts(SDNode *N) { } bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { - MemSDNode *MemSD = cast(N); - const EVT MemEVT = MemSD->getMemoryVT(); + MemSDNode *LD = cast(N); + const EVT MemEVT = LD->getMemoryVT(); if (!MemEVT.isSimple()) return false; const MVT MemVT = MemEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(MemSD); - if (canLowerToLDG(*MemSD, *Subtarget, CodeAddrSpace)) - return tryLDGLDU(N); + const unsigned CodeAddrSpace = getCodeAddrSpace(LD); + if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) + return tryLDG(LD); - EVT EltVT = N->getValueType(0); - SDLoc DL(N); - SDValue Chain = N->getOperand(0); - auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD); + const MVT EltVT = LD->getSimpleValueType(0); + SDLoc DL(LD); + SDValue Chain = LD->getChain(); + const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD); // Type Setting: fromType + fromTypeWidth // @@ -1167,18 +1148,15 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { // Read at least 8 bits (predicates are stored as 8-bit values) // The last operand holds the original LoadSDNode::getExtensionType() value const unsigned TotalWidth = MemVT.getSizeInBits(); - unsigned ExtensionType = N->getConstantOperandVal(N->getNumOperands() - 1); - unsigned FromType = (ExtensionType == ISD::SEXTLOAD) - ? NVPTX::PTXLdStInstCode::Signed - : NVPTX::PTXLdStInstCode::Untyped; + const unsigned ExtensionType = + N->getConstantOperandVal(N->getNumOperands() - 1); + const unsigned FromType = (ExtensionType == ISD::SEXTLOAD) + ? 
NVPTX::PTXLdStInstCode::Signed + : NVPTX::PTXLdStInstCode::Untyped; - unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N); - - if (isSubVectorPackedInI32(EltVT)) { - assert(ExtensionType == ISD::NON_EXTLOAD); - EltVT = MVT::i32; - } + const unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N); + assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD)); assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load"); @@ -1196,192 +1174,183 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { std::optional Opcode; switch (N->getOpcode()) { default: - return false; + llvm_unreachable("Unexpected opcode"); case NVPTXISD::LoadV2: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2, - NVPTX::LDV_i16_v2, NVPTX::LDV_i32_v2, - NVPTX::LDV_i64_v2); + Opcode = + pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v2, NVPTX::LDV_i16_v2, + NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2); break; case NVPTXISD::LoadV4: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4, - NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4, - NVPTX::LDV_i64_v4); + Opcode = + pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v4, NVPTX::LDV_i16_v4, + NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4); break; case NVPTXISD::LoadV8: - Opcode = - pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */}, - {/* no v8i16 */}, NVPTX::LDV_i32_v8, {/* no v8i64 */}); + Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i8 */}, {/* no v8i16 */}, + NVPTX::LDV_i32_v8, {/* no v8i64 */}); break; } if (!Opcode) return false; - SDNode *LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); + SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops); - MachineMemOperand *MemRef = cast(N)->getMemOperand(); - CurDAG->setNodeMemRefs(cast(LD), {MemRef}); + MachineMemOperand *MemRef = LD->getMemOperand(); + CurDAG->setNodeMemRefs(cast(NVPTXLD), {MemRef}); - ReplaceNode(N, LD); + 
ReplaceNode(LD, NVPTXLD); return true; } -bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { - auto *Mem = cast(N); - - // If this is an LDG intrinsic, the address is the third operand. If its an - // LDG/LDU SD node (from custom vector handling), then its the second operand - SDValue Op1 = N->getOperand(N->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1); +bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) { + const EVT LoadedEVT = LD->getMemoryVT(); + if (!LoadedEVT.isSimple()) + return false; + const MVT LoadedVT = LoadedEVT.getSimpleVT(); - const EVT OrigType = N->getValueType(0); - EVT EltVT = Mem->getMemoryVT(); - unsigned NumElts = 1; + SDLoc DL(LD); - if (EltVT == MVT::i128 || EltVT == MVT::f128) { - EltVT = MVT::i64; - NumElts = 2; - } - if (EltVT.isVector()) { - NumElts = EltVT.getVectorNumElements(); - EltVT = EltVT.getVectorElementType(); - // vectors of 8/16bits type are loaded/stored as multiples of v4i8/v2x16 - // elements. - if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) || - (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) || - (EltVT == MVT::i16 && OrigType == MVT::v2i16) || - (EltVT == MVT::i8 && OrigType == MVT::v4i8)) { - assert(NumElts % OrigType.getVectorNumElements() == 0 && - "NumElts must be divisible by the number of elts in subvectors"); - EltVT = OrigType; - NumElts /= OrigType.getVectorNumElements(); - } + const unsigned TotalWidth = LoadedVT.getSizeInBits(); + unsigned ExtensionType; + unsigned NumElts; + if (const auto *Load = dyn_cast(LD)) { + ExtensionType = Load->getExtensionType(); + NumElts = 1; + } else { + ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1); + NumElts = getLoadStoreVectorNumElts(LD); } + const unsigned FromType = (ExtensionType == ISD::SEXTLOAD) + ? NVPTX::PTXLdStInstCode::Signed + : NVPTX::PTXLdStInstCode::Untyped; - // Build the "promoted" result VTList for the load. If we are really loading - // i8s, then the return type will be promoted to i16 since we do not expose - // 8-bit registers in NVPTX. 
- const EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT; - SmallVector InstVTs; - InstVTs.append(NumElts, NodeVT); - InstVTs.push_back(MVT::Other); - SDVTList InstVTList = CurDAG->getVTList(InstVTs); - SDValue Chain = N->getOperand(0); + const unsigned FromTypeWidth = TotalWidth / NumElts; + + assert(!(LD->getSimpleValueType(0).isVector() && + ExtensionType != ISD::NON_EXTLOAD)); + assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && + FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load"); SDValue Base, Offset; - SelectADDR(Op1, Base, Offset); - SDValue Ops[] = {Base, Offset, Chain}; + SelectADDR(LD->getOperand(1), Base, Offset); + SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, + Offset, LD->getChain()}; + const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy; std::optional Opcode; - switch (N->getOpcode()) { + switch (LD->getOpcode()) { default: - return false; + llvm_unreachable("Unexpected opcode"); case ISD::LOAD: - Opcode = pickOpcodeForVT( - EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8, - NVPTX::INT_PTX_LDG_GLOBAL_i16, NVPTX::INT_PTX_LDG_GLOBAL_i32, - NVPTX::INT_PTX_LDG_GLOBAL_i64); - break; - case ISD::INTRINSIC_W_CHAIN: - Opcode = pickOpcodeForVT( - EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8, - NVPTX::INT_PTX_LDU_GLOBAL_i16, NVPTX::INT_PTX_LDU_GLOBAL_i32, - NVPTX::INT_PTX_LDU_GLOBAL_i64); + Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i8, + NVPTX::LD_GLOBAL_NC_i16, NVPTX::LD_GLOBAL_NC_i32, + NVPTX::LD_GLOBAL_NC_i64); break; case NVPTXISD::LoadV2: Opcode = pickOpcodeForVT( - EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE, - NVPTX::INT_PTX_LDG_G_v2i16_ELE, NVPTX::INT_PTX_LDG_G_v2i32_ELE, - NVPTX::INT_PTX_LDG_G_v2i64_ELE); - break; - case NVPTXISD::LDUV2: - Opcode = pickOpcodeForVT( - EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE, - NVPTX::INT_PTX_LDU_G_v2i16_ELE, NVPTX::INT_PTX_LDU_G_v2i32_ELE, - 
NVPTX::INT_PTX_LDU_G_v2i64_ELE); + TargetVT, NVPTX::LD_GLOBAL_NC_v2i8, NVPTX::LD_GLOBAL_NC_v2i16, + NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64); break; case NVPTXISD::LoadV4: Opcode = pickOpcodeForVT( - EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE, - NVPTX::INT_PTX_LDG_G_v4i16_ELE, NVPTX::INT_PTX_LDG_G_v4i32_ELE, - NVPTX::INT_PTX_LDG_G_v4i64_ELE); - break; - case NVPTXISD::LDUV4: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, - NVPTX::INT_PTX_LDU_G_v4i8_ELE, - NVPTX::INT_PTX_LDU_G_v4i16_ELE, - NVPTX::INT_PTX_LDU_G_v4i32_ELE, {/* no v4i64 */}); + TargetVT, NVPTX::LD_GLOBAL_NC_v4i8, NVPTX::LD_GLOBAL_NC_v4i16, + NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64); break; case NVPTXISD::LoadV8: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */}, - {/* no v8i16 */}, NVPTX::INT_PTX_LDG_G_v8i32_ELE, - {/* no v8i64 */}); + Opcode = pickOpcodeForVT(TargetVT, {/* no v8i8 */}, {/* no v8i16 */}, + NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */}); break; } if (!Opcode) return false; - SDLoc DL(N); - SDNode *LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops); + SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops); - // For automatic generation of LDG (through SelectLoad[Vector], not the - // intrinsics), we may have an extending load like: - // - // i32,ch = load t0, t7, undef:i64 - // - // In this case, the matching logic above will select a load for the original - // memory type (in this case, i8) and our types will not match (the node needs - // to return an i32 in this case). Our LDG/LDU nodes do not support the - // concept of sign-/zero-extension, so emulate it here by adding an explicit - // CVT instruction. Ptxas should clean up any redundancies here. - - LoadSDNode *LdNode = dyn_cast(N); - - if (OrigType != EltVT && - (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) { - // We have an extending-load. 
The instruction we selected operates on the - // smaller type, but the SDNode we are replacing has the larger type. We - // need to emit a CVT to make the types match. - unsigned CvtOpc = - GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode); - - // For each output value, apply the manual sign/zero-extension and make sure - // all users of the load go through that CVT. - for (unsigned i = 0; i != NumElts; ++i) { - SDValue Res(LD, i); - SDValue OrigVal(N, i); - - SDNode *CvtNode = - CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res, - CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, - DL, MVT::i32)); - ReplaceUses(OrigVal, SDValue(CvtNode, 0)); - } + ReplaceNode(LD, NVPTXLDG); + return true; +} + +bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) { + auto *LD = cast(N); + + unsigned NumElts; + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case ISD::INTRINSIC_W_CHAIN: + NumElts = 1; + break; + case NVPTXISD::LDUV2: + NumElts = 2; + break; + case NVPTXISD::LDUV4: + NumElts = 4; + break; } - ReplaceNode(N, LD); + const MVT::SimpleValueType SelectVT = + MVT::getIntegerVT(LD->getMemoryVT().getSizeInBits() / NumElts).SimpleTy; + + // If this is an LDU intrinsic, the address is the third operand. If its an + // LDU SD node (from custom vector handling), then its the second operand + SDValue Addr = + LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 
2 : 1); + + SDValue Base, Offset; + SelectADDR(Addr, Base, Offset); + SDValue Ops[] = {Base, Offset, LD->getChain()}; + + std::optional Opcode; + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case ISD::INTRINSIC_W_CHAIN: + Opcode = + pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_i8, NVPTX::LDU_GLOBAL_i16, + NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64); + break; + case NVPTXISD::LDUV2: + Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v2i8, + NVPTX::LDU_GLOBAL_v2i16, NVPTX::LDU_GLOBAL_v2i32, + NVPTX::LDU_GLOBAL_v2i64); + break; + case NVPTXISD::LDUV4: + Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v4i8, + NVPTX::LDU_GLOBAL_v4i16, NVPTX::LDU_GLOBAL_v4i32, + {/* no v4i64 */}); + break; + } + if (!Opcode) + return false; + + SDLoc DL(N); + SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops); + + ReplaceNode(LD, NVPTXLDU); return true; } bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { MemSDNode *ST = cast(N); assert(ST->writeMem() && "Expected store"); - StoreSDNode *PlainStore = dyn_cast(N); - AtomicSDNode *AtomicStore = dyn_cast(N); + StoreSDNode *PlainStore = dyn_cast(ST); + AtomicSDNode *AtomicStore = dyn_cast(ST); assert((PlainStore || AtomicStore) && "Expected store"); // do not support pre/post inc/dec if (PlainStore && PlainStore->isIndexed()) return false; - EVT StoreVT = ST->getMemoryVT(); + const EVT StoreVT = ST->getMemoryVT(); if (!StoreVT.isSimple()) return false; // Address Space Setting - unsigned int CodeAddrSpace = getCodeAddrSpace(ST); + const unsigned CodeAddrSpace = getCodeAddrSpace(ST); - SDLoc DL(N); + SDLoc DL(ST); SDValue Chain = ST->getChain(); - auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST); + const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST); // Vector Setting const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits(); @@ -1417,85 +1386,78 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { if (!NVPTXST) return false; - 
MachineMemOperand *MemRef = cast(N)->getMemOperand(); + MachineMemOperand *MemRef = ST->getMemOperand(); CurDAG->setNodeMemRefs(cast(NVPTXST), {MemRef}); - ReplaceNode(N, NVPTXST); + ReplaceNode(ST, NVPTXST); return true; } bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { - SDValue Op1 = N->getOperand(1); - EVT EltVT = Op1.getValueType(); - MemSDNode *MemSD = cast(N); - EVT StoreVT = MemSD->getMemoryVT(); + MemSDNode *ST = cast(N); + const EVT StoreVT = ST->getMemoryVT(); assert(StoreVT.isSimple() && "Store value is not simple"); // Address Space Setting - unsigned CodeAddrSpace = getCodeAddrSpace(MemSD); + const unsigned CodeAddrSpace = getCodeAddrSpace(ST); if (CodeAddrSpace == NVPTX::AddressSpace::Const) { report_fatal_error("Cannot store to pointer that points to constant " "memory space"); } - SDLoc DL(N); - SDValue Chain = N->getOperand(0); - auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD); + SDLoc DL(ST); + SDValue Chain = ST->getChain(); + const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST); // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits(); - unsigned NumElts = getLoadStoreVectorNumElts(N); - - SmallVector Ops(N->ops().slice(1, NumElts)); - SDValue N2 = N->getOperand(NumElts + 1); - unsigned ToTypeWidth = TotalWidth / NumElts; + const unsigned NumElts = getLoadStoreVectorNumElts(ST); - if (isSubVectorPackedInI32(EltVT)) { - EltVT = MVT::i32; - } + SmallVector Ops(ST->ops().slice(1, NumElts)); + SDValue Addr = N->getOperand(NumElts + 1); + const unsigned ToTypeWidth = TotalWidth / NumElts; assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for store"); SDValue Offset, Base; - SelectADDR(N2, Base, Offset); + SelectADDR(Addr, Base, Offset); Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), 
getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL), getI32Imm(ToTypeWidth, DL), Base, Offset, Chain}); + const MVT::SimpleValueType EltVT = + ST->getOperand(1).getSimpleValueType().SimpleTy; std::optional Opcode; - switch (N->getOpcode()) { + switch (ST->getOpcode()) { default: return false; case NVPTXISD::StoreV2: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2, - NVPTX::STV_i16_v2, NVPTX::STV_i32_v2, - NVPTX::STV_i64_v2); + Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v2, NVPTX::STV_i16_v2, + NVPTX::STV_i32_v2, NVPTX::STV_i64_v2); break; case NVPTXISD::StoreV4: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4, - NVPTX::STV_i16_v4, NVPTX::STV_i32_v4, - NVPTX::STV_i64_v4); + Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v4, NVPTX::STV_i16_v4, + NVPTX::STV_i32_v4, NVPTX::STV_i64_v4); break; case NVPTXISD::StoreV8: - Opcode = - pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */}, - {/* no v8i16 */}, NVPTX::STV_i32_v8, {/* no v8i64 */}); + Opcode = pickOpcodeForVT(EltVT, {/* no v8i8 */}, {/* no v8i16 */}, + NVPTX::STV_i32_v8, {/* no v8i64 */}); break; } if (!Opcode) return false; - SDNode *ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); + SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); - MachineMemOperand *MemRef = cast(N)->getMemOperand(); - CurDAG->setNodeMemRefs(cast(ST), {MemRef}); + MachineMemOperand *MemRef = ST->getMemOperand(); + CurDAG->setNodeMemRefs(cast(NVPTXST), {MemRef}); - ReplaceNode(N, ST); + ReplaceNode(ST, NVPTXST); return true; } @@ -2285,70 +2247,6 @@ void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) { ReplaceNode(N, Mov); } -/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a -/// conversion from \p SrcTy to \p DestTy. 
-unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy, - LoadSDNode *LdNode) { - bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD; - switch (SrcTy.SimpleTy) { - default: - llvm_unreachable("Unhandled source type"); - case MVT::i8: - switch (DestTy.SimpleTy) { - default: - llvm_unreachable("Unhandled dest type"); - case MVT::i16: - return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8; - case MVT::i32: - return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8; - case MVT::i64: - return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8; - } - case MVT::i16: - switch (DestTy.SimpleTy) { - default: - llvm_unreachable("Unhandled dest type"); - case MVT::i8: - return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16; - case MVT::i32: - return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16; - case MVT::i64: - return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16; - } - case MVT::i32: - switch (DestTy.SimpleTy) { - default: - llvm_unreachable("Unhandled dest type"); - case MVT::i8: - return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32; - case MVT::i16: - return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32; - case MVT::i64: - return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32; - } - case MVT::i64: - switch (DestTy.SimpleTy) { - default: - llvm_unreachable("Unhandled dest type"); - case MVT::i8: - return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64; - case MVT::i16: - return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64; - case MVT::i32: - return IsSigned ? 
NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64; - } - case MVT::f16: - switch (DestTy.SimpleTy) { - default: - llvm_unreachable("Unhandled dest type"); - case MVT::f32: - return NVPTX::CVT_f32_f16; - case MVT::f64: - return NVPTX::CVT_f64_f16; - } - } -} - bool NVPTXDAGToDAGISel::tryFence(SDNode *N) { SDLoc DL(N); assert(N->getOpcode() == ISD::ATOMIC_FENCE); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 473f4781a6c38..ff58e4486a222 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -75,7 +75,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { void SelectTexSurfHandle(SDNode *N); bool tryLoad(SDNode *N); bool tryLoadVector(SDNode *N); - bool tryLDGLDU(SDNode *N); + bool tryLDU(SDNode *N); + bool tryLDG(MemSDNode *N); bool tryStore(SDNode *N); bool tryStoreVector(SDNode *N); bool tryLoadParam(SDNode *N); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 4c3501df57f84..5dbdce52f0553 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -135,11 +135,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">; def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">; def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; -def hasVote : Predicate<"Subtarget->hasVote()">; -def hasDouble : Predicate<"Subtarget->hasDouble()">; def hasClusters : Predicate<"Subtarget->hasClusters()">; -def hasLDG : Predicate<"Subtarget->hasLDG()">; -def hasLDU : Predicate<"Subtarget->hasLDU()">; def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">; def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">; def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td 
b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index b3c1296cf0ca6..5de3dee1fb344 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2143,15 +2143,12 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; class LDU_G : NVPTXInst<(outs regclass:$result), (ins ADDR:$src), - "ldu.global." # TyStr # " \t$result, [$src];", - []>, Requires<[hasLDU]>; + "ldu.global." # TyStr # " \t$result, [$src];", []>; -def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>; -def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>; -def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>; -def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>; -def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>; -def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>; +def LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>; +def LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>; +def LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>; +def LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>; // vector @@ -2168,19 +2165,14 @@ class VLDU_G_ELE_V4 "ldu.global.v4." 
# TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; -def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>; -def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>; -def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>; -def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>; -def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>; -def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>; +def LDU_GLOBAL_v2i8 : VLDU_G_ELE_V2<"b8", Int16Regs>; +def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<"b16", Int16Regs>; +def LDU_GLOBAL_v2i32 : VLDU_G_ELE_V2<"b32", Int32Regs>; +def LDU_GLOBAL_v2i64 : VLDU_G_ELE_V2<"b64", Int64Regs>; -def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>; -def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>; -def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>; -def INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>; -def INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>; -def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>; +def LDU_GLOBAL_v4i8 : VLDU_G_ELE_V4<"b8", Int16Regs>; +def LDU_GLOBAL_v4i16 : VLDU_G_ELE_V4<"b16", Int16Regs>; +def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", Int32Regs>; //----------------------------------- @@ -2191,55 +2183,47 @@ def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>; // non-coherent texture cache, and therefore the values read must be read-only // during the lifetime of the kernel. -class LDG_G - : NVPTXInst<(outs regclass:$result), (ins ADDR:$src), - "ld.global.nc." 
# TyStr # " \t$result, [$src];", - []>, Requires<[hasLDG]>; +class LDG_G + : NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>; -def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>; -def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>; -def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>; -def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>; -def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>; -def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>; +def LD_GLOBAL_NC_i8 : LDG_G; +def LD_GLOBAL_NC_i16 : LDG_G; +def LD_GLOBAL_NC_i32 : LDG_G; +def LD_GLOBAL_NC_i64 : LDG_G; // vector // Elementized vector ldg -class VLDG_G_ELE_V2 : +class VLDG_G_ELE_V2 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), - (ins ADDR:$src), - "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>; + (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>; -class VLDG_G_ELE_V4 : +class VLDG_G_ELE_V4 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins ADDR:$src), - "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; + (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; -class VLDG_G_ELE_V8 : +class VLDG_G_ELE_V8 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), - (ins ADDR:$src), - "ld.global.nc.v8." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>; + (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>; // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. 
-def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>; -def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>; -def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>; -def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>; -def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>; -def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>; - -def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>; -def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>; -def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>; -def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>; - -def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"b64", Int64Regs>; -def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"b64", Float64Regs>; -def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"b32", Int32Regs>; -def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>; +def LD_GLOBAL_NC_v2i8 : VLDG_G_ELE_V2; +def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2; +def LD_GLOBAL_NC_v2i32 : VLDG_G_ELE_V2; +def LD_GLOBAL_NC_v2i64 : VLDG_G_ELE_V2; + +def LD_GLOBAL_NC_v4i8 : VLDG_G_ELE_V4; +def LD_GLOBAL_NC_v4i16 : VLDG_G_ELE_V4; +def LD_GLOBAL_NC_v4i32 : VLDG_G_ELE_V4; + +def LD_GLOBAL_NC_v4i64 : VLDG_G_ELE_V4; +def LD_GLOBAL_NC_v8i32 : VLDG_G_ELE_V8; multiclass NG_TO_G Preds = []> { if Supports32 then diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 85d28a703a4cb..ef310e5828f22 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -30,6 +30,7 @@ #include "llvm/Pass.h" #include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/TargetParser/Triple.h" @@ -87,7 +88,7 @@ static cl::opt EarlyByValArgsCopy( cl::desc("Create a copy of byval function arguments early."), cl::init(false), cl::Hidden); -extern 
"C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { // Register the target. RegisterTargetMachine X(getTheNVPTXTarget32()); RegisterTargetMachine Y(getTheNVPTXTarget64()); diff --git a/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp index e4f0a517599fc..24fea037b1c54 100644 --- a/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp +++ b/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/NVPTXTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; Target &llvm::getTheNVPTXTarget32() { @@ -19,7 +20,8 @@ Target &llvm::getTheNVPTXTarget64() { return TheNVPTXTarget64; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeNVPTXTargetInfo() { RegisterTarget X(getTheNVPTXTarget32(), "nvptx", "NVIDIA PTX 32-bit", "NVPTX"); RegisterTarget Y(getTheNVPTXTarget64(), "nvptx64", diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index bb4c2fd3e5cf8..2b3727be644da 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" @@ -1784,7 +1785,8 @@ bool PPCAsmParser::parseGNUAttribute(SMLoc L) { } /// Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializePowerPCAsmParser() { RegisterMCAsmParser A(getThePPC32Target()); RegisterMCAsmParser B(getThePPC32LETarget()); RegisterMCAsmParser C(getThePPC64Target()); diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 0c6c17d5a0b68..71a76142bb389 100644 --- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -13,6 +13,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" using namespace llvm; @@ -50,7 +51,8 @@ static MCDisassembler *createPPCLEDisassembler(const Target &T, return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializePowerPCDisassembler() { // Register the disassembler for each target. 
TargetRegistry::RegisterMCDisassembler(getThePPC32Target(), createPPCDisassembler); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 44b5732be6e3e..dd2756a1a8238 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -41,6 +41,7 @@ #include "llvm/MC/MCXCOFFObjectWriter.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" @@ -473,7 +474,8 @@ static MCInstrAnalysis *createPPCMCInstrAnalysis(const MCInstrInfo *Info) { return new PPCMCInstrAnalysis(Info); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializePowerPCTargetMC() { for (Target *T : {&getThePPC32Target(), &getThePPC32LETarget(), &getThePPC64Target(), &getThePPC64LETarget()}) { // Register the MC asm info. diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index d5d51e3ca6386..9e42011c0c746 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -63,6 +63,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" @@ -3374,7 +3375,8 @@ INITIALIZE_PASS(PPCAIXAsmPrinter, "ppc-aix-asm-printer", "AIX PPC Assembly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializePowerPCAsmPrinter() { TargetRegistry::RegisterAsmPrinter(getThePPC32Target(), createPPCAsmPrinterPass); TargetRegistry::RegisterAsmPrinter(getThePPC32LETarget(), diff --git a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp index 47bb20f4aa073..d0a5be8b2e23a 100644 --- a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp +++ b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp @@ -117,7 +117,7 @@ class PPCBoolRetToInt : public FunctionPass { // A PHINode is Promotable if: // 1. Its type is i1 AND - // 2. All of its uses are ReturnInt, CallInst, PHINode, or DbgInfoIntrinsic + // 2. All of its uses are ReturnInt, CallInst, or PHINode // AND // 3. All of its operands are Constant or Argument or // CallInst or PHINode AND @@ -136,8 +136,7 @@ class PPCBoolRetToInt : public FunctionPass { for (const PHINode *P : Promotable) { // Condition 2 and 3 auto IsValidUser = [] (const Value *V) -> bool { - return isa(V) || isa(V) || isa(V) || - isa(V); + return isa(V) || isa(V) || isa(V); }; auto IsValidOperand = [] (const Value *V) -> bool { return isa(V) || isa(V) || isa(V) || diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 359a43dd001d2..b5c6ac111dff0 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -38,6 +38,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include "llvm/TargetParser/Triple.h" @@ -113,7 +114,8 @@ static cl::opt cl::init(0x7fff), cl::desc("Maximum global merge offset")); -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void 
+LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine A(getThePPC32Target()); RegisterTargetMachine B(getThePPC32LETarget()); diff --git a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp index 0bfa0bd5ec0e7..982be2746b47b 100644 --- a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp +++ b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; Target &llvm::getThePPC32Target() { @@ -27,7 +28,8 @@ Target &llvm::getThePPC64LETarget() { return ThePPC64LETarget; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializePowerPCTargetInfo() { RegisterTarget W(getThePPC32Target(), "ppc32", "PowerPC 32", "PPC"); diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 510ca5f8c0d92..f1d6f99ba9815 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -36,6 +36,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/RISCVAttributes.h" #include "llvm/TargetParser/RISCVISAInfo.h" @@ -4021,7 +4022,8 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return false; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeRISCVAsmParser() { RegisterMCAsmParser X(getTheRISCV32Target()); RegisterMCAsmParser Y(getTheRISCV64Target()); } diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp 
index 4363e5c5176c9..27e04c0cb1f8b 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" using namespace llvm; @@ -64,7 +65,8 @@ static MCDisassembler *createRISCVDisassembler(const Target &T, return new RISCVDisassembler(STI, Ctx, T.createMCInstrInfo()); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeRISCVDisassembler() { // Register the disassembler for each target. TargetRegistry::RegisterMCDisassembler(getTheRISCV32Target(), createRISCVDisassembler); @@ -772,8 +774,8 @@ static constexpr FeatureBitset XTHeadGroup = { RISCV::FeatureVendorXTHeadVdot}; static constexpr FeatureBitset XAndesGroup = { - RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVPackFPH, - RISCV::FeatureVendorXAndesVDot}; + RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVBFHCvt, + RISCV::FeatureVendorXAndesVPackFPH, RISCV::FeatureVendorXAndesVDot}; static constexpr DecoderListEntry DecoderList32[]{ // Vendor Extensions diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp index e3b89d84a134b..ae44306170758 100644 --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp @@ -16,6 +16,7 @@ #include "RISCV.h" #include "TargetInfo/RISCVTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "llvm-mca-riscv-custombehaviour" @@ -344,7 +345,8 @@ createRISCVInstrumentManager(const MCSubtargetInfo &STI, } /// Extern function to initialize the targets for the RISC-V backend -extern "C" LLVM_EXTERNAL_VISIBILITY void 
LLVMInitializeRISCVTargetMCA() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeRISCVTargetMCA() { TargetRegistry::RegisterInstrumentManager(getTheRISCV32Target(), createRISCVInstrumentManager); TargetRegistry::RegisterInstrumentManager(getTheRISCV64Target(), diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 2f37c351baf9f..9161f23c8a954 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -85,7 +85,7 @@ MCFixupKindInfo RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"fixup_riscv_qc_e_branch", 0, 48, MCFixupKindInfo::FKF_IsPCRel}, {"fixup_riscv_qc_e_32", 16, 32, 0}, {"fixup_riscv_qc_abs20_u", 12, 20, 0}, - {"fixup_riscv_qc_e_jump_plt", 0, 48, MCFixupKindInfo::FKF_IsPCRel}, + {"fixup_riscv_qc_e_call_plt", 0, 48, MCFixupKindInfo::FKF_IsPCRel}, }; static_assert((std::size(Infos)) == RISCV::NumTargetFixupKinds, "Not all fixup kinds added to Infos array"); @@ -552,7 +552,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Value = (Bit19 << 31) | (Bit14_0 << 16) | (Bit18_15 << 12); return Value; } - case RISCV::fixup_riscv_qc_e_jump_plt: { + case RISCV::fixup_riscv_qc_e_call_plt: { if (!isInt<32>(Value)) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); if (Value & 0x1) @@ -699,7 +699,7 @@ void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F, case RISCV::fixup_riscv_qc_e_branch: case RISCV::fixup_riscv_qc_abs20_u: case RISCV::fixup_riscv_qc_e_32: - case RISCV::fixup_riscv_qc_e_jump_plt: + case RISCV::fixup_riscv_qc_e_call_plt: VendorIdentifier = "QUALCOMM"; break; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 1d81096d6b600..3c1f9450a0991 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ 
b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -101,8 +101,8 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup, return ELF::R_RISCV_CALL_PLT; case RISCV::fixup_riscv_qc_e_branch: return ELF::R_RISCV_QC_E_BRANCH; - case RISCV::fixup_riscv_qc_e_jump_plt: - return ELF::R_RISCV_QC_E_JUMP_PLT; + case RISCV::fixup_riscv_qc_e_call_plt: + return ELF::R_RISCV_QC_E_CALL_PLT; } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index 80fbed8d10f99..8d869a64cde47 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -54,7 +54,7 @@ enum Fixups { // 20-bit fixup for symbol references in the 32-bit qc.li instruction fixup_riscv_qc_abs20_u, // 32-bit fixup for symbol references in the 48-bit qc.j/qc.jal instructions - fixup_riscv_qc_e_jump_plt, + fixup_riscv_qc_e_call_plt, // Used as a sentinel, must be the last fixup_riscv_invalid, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 1185e3558b002..2a90552037f91 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -645,7 +645,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, FixupKind = RISCV::fixup_riscv_qc_e_32; RelaxCandidate = true; } else if (MIFrm == RISCVII::InstFormatQC_EJ) { - FixupKind = RISCV::fixup_riscv_qc_e_jump_plt; + FixupKind = RISCV::fixup_riscv_qc_e_call_plt; RelaxCandidate = true; } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp index f3b93f032588c..f66c2d5f99cb3 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp @@ -28,6 +28,7 @@ #include "llvm/MC/MCStreamer.h" #include 
"llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include @@ -331,7 +332,8 @@ static MCInstrAnalysis *createRISCVInstrAnalysis(const MCInstrInfo *Info) { return new RISCVMCInstrAnalysis(Info); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeRISCVTargetMC() { for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) { TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo); TargetRegistry::RegisterMCObjectFileInfo(*T, createRISCVMCObjectFileInfo); diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 83e9b4b4d7c5c..d4d7de289a107 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -38,6 +38,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/RISCVISAInfo.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" @@ -610,7 +611,8 @@ void RISCVAsmPrinter::emitFunctionEntryLabel() { } // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeRISCVAsmPrinter() { RegisterAsmPrinter X(getTheRISCV32Target()); RegisterAsmPrinter Y(getTheRISCV64Target()); } diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 940caa4f40444..6df6368929dac 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1426,7 +1426,8 @@ def HasVendorXwchc // Qualcomm Extensions def FeatureVendorXqccmp - : RISCVExperimentalExtension<0, 1, "Qualcomm 16-bit Push/Pop and Double Moves", + : RISCVExperimentalExtension<0, 3, + "Qualcomm 16-bit Push/Pop and Double Moves", [FeatureStdExtZca]>; def HasVendorXqccmp : Predicate<"Subtarget->hasVendorXqccmp()">, @@ -1488,14 +1489,14 @@ def HasVendorXqcics "'Xqcics' (Qualcomm uC Conditional Select Extension)">; def FeatureVendorXqcicsr - : RISCVExperimentalExtension<0, 3, "Qualcomm uC CSR Extension">; + : RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">; def HasVendorXqcicsr : Predicate<"Subtarget->hasVendorXqcicsr()">, AssemblerPredicate<(all_of FeatureVendorXqcicsr), "'Xqcicsr' (Qualcomm uC CSR Extension)">; def FeatureVendorXqciint - : RISCVExperimentalExtension<0, 7, "Qualcomm uC Interrupts Extension", + : RISCVExperimentalExtension<0, 10, "Qualcomm uC Interrupts Extension", [FeatureStdExtZca]>; def HasVendorXqciint : Predicate<"Subtarget->hasVendorXqciint()">, @@ -1542,7 +1543,7 @@ def HasVendorXqcilo "'Xqcilo' (Qualcomm uC Large Offset Load Store Extension)">; def FeatureVendorXqcilsm - : RISCVExperimentalExtension<0, 5, + : RISCVExperimentalExtension<0, 6, "Qualcomm uC Load Store Multiple Extension">; def HasVendorXqcilsm : Predicate<"Subtarget->hasVendorXqcilsm()">, @@ -1598,6 +1599,15 @@ def HasVendorXAndesPerf AssemblerPredicate<(all_of FeatureVendorXAndesPerf), "'XAndesPerf' (Andes Performance Extension)">; +def FeatureVendorXAndesVBFHCvt + : 
RISCVExtension<5, 0, "Andes Vector BFLOAT16 Conversion Extension", + [FeatureStdExtZve32f]>; +def HasVendorXAndesVBFHCvt + : Predicate<"Subtarget->hasVendorXAndesVBFHCvt()">, + AssemblerPredicate<(all_of FeatureVendorXAndesVBFHCvt), + "'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension)">; + + def FeatureVendorXAndesVPackFPH : RISCVExtension<5, 0, "Andes Vector Packed FP16 Extension", [FeatureStdExtZvfhmin]>; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7cfada6c0601c..e670567bd1844 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3498,14 +3498,6 @@ getVSlideup(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL, return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VT, Ops); } -static MVT getLMUL1VT(MVT VT) { - assert(VT.getVectorElementType().getSizeInBits() <= 64 && - "Unexpected vector MVT"); - return MVT::getScalableVectorVT( - VT.getVectorElementType(), - RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits()); -} - struct VIDSequence { int64_t StepNumerator; unsigned StepDenominator; @@ -4316,7 +4308,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg); MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget); - assert(M1VT == getLMUL1VT(M1VT)); + assert(M1VT == RISCVTargetLowering::getM1VT(M1VT)); // The following semantically builds up a fixed length concat_vector // of the component build_vectors. 
We eagerly lower to scalable and @@ -4356,7 +4348,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); }); unsigned NumDefElts = NumElts - NumUndefElts; if (NumDefElts >= 8 && NumDefElts > NumElts / 2 && - ContainerVT.bitsLE(getLMUL1VT(ContainerVT))) { + ContainerVT.bitsLE(RISCVTargetLowering::getM1VT(ContainerVT))) { SmallVector SubVecAOps, SubVecBOps; SmallVector MaskVals; SDValue UndefElem = DAG.getUNDEF(Op->getOperand(0)->getValueType(0)); @@ -5114,7 +5106,8 @@ static SDValue lowerVZIP(unsigned Opc, SDValue Op0, SDValue Op1, MVT InnerVT = ContainerVT; auto [Mask, VL] = getDefaultVLOps(IntVT, InnerVT, DL, DAG, Subtarget); - if (Op1.isUndef() && ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) && + if (Op1.isUndef() && + ContainerVT.bitsGT(RISCVTargetLowering::getM1VT(ContainerVT)) && (RISCVISD::RI_VUNZIP2A_VL == Opc || RISCVISD::RI_VUNZIP2B_VL == Opc)) { InnerVT = ContainerVT.getHalfNumVectorElementsVT(); VL = DAG.getConstant(VT.getVectorNumElements() / 2, DL, @@ -5382,7 +5375,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg); MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget); - assert(M1VT == getLMUL1VT(M1VT)); + assert(M1VT == RISCVTargetLowering::getM1VT(M1VT)); unsigned NumOpElts = M1VT.getVectorMinNumElements(); unsigned NumElts = ContainerVT.getVectorMinNumElements(); unsigned NumOfSrcRegs = NumElts / NumOpElts; @@ -6152,7 +6145,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return convertFromScalableVector(VT, Gather, DAG, Subtarget); } - const MVT M1VT = getLMUL1VT(ContainerVT); + const MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT); EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType()); auto [InnerTrueMask, InnerVL] = getDefaultScalableVLOps(M1VT, DL, DAG, 
Subtarget); @@ -7360,20 +7353,25 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, uint64_t Val = Op.getConstantOperandVal(0); if (isPowerOf2_64(Val)) { uint64_t Log2 = Log2_64(Val); - if (Log2 < 3) + if (Log2 < 3) { + SDNodeFlags Flags; + Flags.setExact(true); Res = DAG.getNode(ISD::SRL, DL, XLenVT, Res, - DAG.getConstant(3 - Log2, DL, VT)); - else if (Log2 > 3) + DAG.getConstant(3 - Log2, DL, XLenVT), Flags); + } else if (Log2 > 3) { Res = DAG.getNode(ISD::SHL, DL, XLenVT, Res, DAG.getConstant(Log2 - 3, DL, XLenVT)); + } } else if ((Val % 8) == 0) { // If the multiplier is a multiple of 8, scale it down to avoid needing // to shift the VLENB value. Res = DAG.getNode(ISD::MUL, DL, XLenVT, Res, DAG.getConstant(Val / 8, DL, XLenVT)); } else { + SDNodeFlags Flags; + Flags.setExact(true); SDValue VScale = DAG.getNode(ISD::SRL, DL, XLenVT, Res, - DAG.getConstant(3, DL, XLenVT)); + DAG.getConstant(3, DL, XLenVT), Flags); Res = DAG.getNode(ISD::MUL, DL, XLenVT, VScale, DAG.getConstant(Val, DL, XLenVT)); } @@ -7801,7 +7799,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, // This reduces the length of the chain of vslideups and allows us to // perform the vslideups at a smaller LMUL, limited to MF2. 
if (Op.getNumOperands() > 2 && - ContainerVT.bitsGE(getLMUL1VT(ContainerVT))) { + ContainerVT.bitsGE(RISCVTargetLowering::getM1VT(ContainerVT))) { MVT HalfVT = VT.getHalfNumVectorElementsVT(); assert(isPowerOf2_32(Op.getNumOperands())); size_t HalfNumOps = Op.getNumOperands() / 2; @@ -9821,11 +9819,12 @@ getSmallestVTForIndex(MVT VecVT, unsigned MaxIdx, SDLoc DL, SelectionDAG &DAG, const unsigned MinVLMAX = VectorBitsMin / EltSize; MVT SmallerVT; if (MaxIdx < MinVLMAX) - SmallerVT = getLMUL1VT(VecVT); + SmallerVT = RISCVTargetLowering::getM1VT(VecVT); else if (MaxIdx < MinVLMAX * 2) - SmallerVT = getLMUL1VT(VecVT).getDoubleNumVectorElementsVT(); + SmallerVT = + RISCVTargetLowering::getM1VT(VecVT).getDoubleNumVectorElementsVT(); else if (MaxIdx < MinVLMAX * 4) - SmallerVT = getLMUL1VT(VecVT) + SmallerVT = RISCVTargetLowering::getM1VT(VecVT) .getDoubleNumVectorElementsVT() .getDoubleNumVectorElementsVT(); if (!SmallerVT.isValid() || !VecVT.bitsGT(SmallerVT)) @@ -9898,9 +9897,8 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, // If we're compiling for an exact VLEN value, we can always perform // the insert in m1 as we can determine the register corresponding to // the index in the register group. 
- const MVT M1VT = getLMUL1VT(ContainerVT); - if (auto VLEN = Subtarget.getRealVLen(); - VLEN && ContainerVT.bitsGT(M1VT)) { + const MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT); + if (auto VLEN = Subtarget.getRealVLen(); VLEN && ContainerVT.bitsGT(M1VT)) { EVT ElemVT = VecVT.getVectorElementType(); unsigned ElemsPerVReg = *VLEN / ElemVT.getFixedSizeInBits(); unsigned RemIdx = OrigIdx % ElemsPerVReg; @@ -10127,7 +10125,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, const auto VLen = Subtarget.getRealVLen(); if (auto *IdxC = dyn_cast(Idx); IdxC && VLen && VecVT.getSizeInBits().getKnownMinValue() > *VLen) { - MVT M1VT = getLMUL1VT(ContainerVT); + MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT); unsigned OrigIdx = IdxC->getZExtValue(); EVT ElemVT = VecVT.getVectorElementType(); unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); @@ -10175,7 +10173,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, // TODO: We don't have the same code for insert_vector_elt because we // have BUILD_VECTOR and handle the degenerate case there. Should we // consider adding an inverse BUILD_VECTOR node? 
- MVT LMUL2VT = getLMUL1VT(ContainerVT).getDoubleNumVectorElementsVT(); + MVT LMUL2VT = + RISCVTargetLowering::getM1VT(ContainerVT).getDoubleNumVectorElementsVT(); if (ContainerVT.bitsGT(LMUL2VT) && VecVT.isFixedLengthVector()) return SDValue(); @@ -11107,7 +11106,7 @@ static SDValue lowerReductionSeq(unsigned RVVOpcode, MVT ResVT, SDValue VL, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { const MVT VecVT = Vec.getSimpleValueType(); - const MVT M1VT = getLMUL1VT(VecVT); + const MVT M1VT = RISCVTargetLowering::getM1VT(VecVT); const MVT XLenVT = Subtarget.getXLenVT(); const bool NonZeroAVL = isNonZeroAVL(VL); @@ -11485,8 +11484,8 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, assert(VLen); AlignedIdx /= *VLen / RISCV::RVVBitsPerBlock; } - if (ContainerVecVT.bitsGT(getLMUL1VT(ContainerVecVT))) { - InterSubVT = getLMUL1VT(ContainerVecVT); + if (ContainerVecVT.bitsGT(RISCVTargetLowering::getM1VT(ContainerVecVT))) { + InterSubVT = RISCVTargetLowering::getM1VT(ContainerVecVT); // Extract a subvector equal to the nearest full vector register type. This // should resolve to a EXTRACT_SUBREG instruction. AlignedExtract = DAG.getExtractSubvector(DL, InterSubVT, Vec, AlignedIdx); @@ -11677,7 +11676,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, // If the vector type is an LMUL-group type, extract a subvector equal to the // nearest full vector register type. MVT InterSubVT = VecVT; - if (VecVT.bitsGT(getLMUL1VT(VecVT))) { + if (VecVT.bitsGT(RISCVTargetLowering::getM1VT(VecVT))) { // If VecVT has an LMUL > 1, then SubVecVT should have a smaller LMUL, and // we should have successfully decomposed the extract into a subregister. // We use an extract_subvector that will resolve to a subreg extract. 
@@ -11688,7 +11687,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, assert(VLen); Idx /= *VLen / RISCV::RVVBitsPerBlock; } - InterSubVT = getLMUL1VT(VecVT); + InterSubVT = RISCVTargetLowering::getM1VT(VecVT); Vec = DAG.getExtractSubvector(DL, InterSubVT, Vec, Idx); } @@ -11805,7 +11804,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, // For fractional LMUL, check if we can use a higher LMUL // instruction to avoid a vslidedown. if (SDValue Src = foldConcatVector(V1, V2); - Src && getLMUL1VT(VT).bitsGT(VT)) { + Src && RISCVTargetLowering::getM1VT(VT).bitsGT(VT)) { EVT NewVT = VT.getDoubleNumVectorElementsVT(); Src = DAG.getExtractSubvector(DL, NewVT, Src, 0); // Freeze the source so we can increase its use count. @@ -12187,7 +12186,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op, // vrgather.vv v14, v9, v16 // vrgather.vv v13, v10, v16 // vrgather.vv v12, v11, v16 - if (ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) && + if (ContainerVT.bitsGT(RISCVTargetLowering::getM1VT(ContainerVT)) && ContainerVT.getVectorElementCount().isKnownMultipleOf(2)) { auto [Lo, Hi] = DAG.SplitVector(Vec, DL); Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, Lo.getSimpleValueType(), Lo); @@ -12252,7 +12251,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op, // At LMUL > 1, do the index computation in 16 bits to reduce register // pressure. 
if (IntVT.getScalarType().bitsGT(MVT::i16) && - IntVT.bitsGT(getLMUL1VT(IntVT))) { + IntVT.bitsGT(RISCVTargetLowering::getM1VT(IntVT))) { assert(isUInt<16>(MaxVLMAX - 1)); // Largest VLMAX is 65536 @ zvl65536b GatherOpc = RISCVISD::VRGATHEREI16_VV_VL; IntVT = IntVT.changeVectorElementType(MVT::i16); @@ -12339,7 +12338,7 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op, const auto [MinVLMAX, MaxVLMAX] = RISCVTargetLowering::computeVLMAXBounds(ContainerVT, Subtarget); if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() && - getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) { + RISCVTargetLowering::getM1VT(ContainerVT).bitsLE(ContainerVT)) { MachineMemOperand *MMO = Load->getMemOperand(); SDValue NewLoad = DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(), @@ -12400,7 +12399,7 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op, const auto [MinVLMAX, MaxVLMAX] = RISCVTargetLowering::computeVLMAXBounds(ContainerVT, Subtarget); if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() && - getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) { + RISCVTargetLowering::getM1VT(ContainerVT).bitsLE(ContainerVT)) { MachineMemOperand *MMO = Store->getMemOperand(); return DAG.getStore(Store->getChain(), DL, NewValue, Store->getBasePtr(), MMO->getPointerInfo(), MMO->getBaseAlign(), @@ -20368,7 +20367,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return Scalar.getOperand(0); // Use M1 or smaller to avoid over constraining register allocation - const MVT M1VT = getLMUL1VT(VT); + const MVT M1VT = RISCVTargetLowering::getM1VT(VT); if (M1VT.bitsLT(VT)) { SDValue M1Passthru = DAG.getExtractSubvector(DL, M1VT, Passthru, 0); SDValue Result = @@ -20382,7 +20381,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // no purpose. 
if (ConstantSDNode *Const = dyn_cast(Scalar); Const && !Const->isZero() && isInt<5>(Const->getSExtValue()) && - VT.bitsLE(getLMUL1VT(VT)) && Passthru.isUndef()) + VT.bitsLE(RISCVTargetLowering::getM1VT(VT)) && Passthru.isUndef()) return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Scalar, VL); break; @@ -20390,7 +20389,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case RISCVISD::VMV_X_S: { SDValue Vec = N->getOperand(0); MVT VecVT = N->getOperand(0).getSimpleValueType(); - const MVT M1VT = getLMUL1VT(VecVT); + const MVT M1VT = RISCVTargetLowering::getM1VT(VecVT); if (M1VT.bitsLT(VecVT)) { Vec = DAG.getExtractSubvector(DL, M1VT, Vec, 0); return DAG.getNode(RISCVISD::VMV_X_S, DL, N->getSimpleValueType(0), Vec); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 417d684a62382..f67d7f155c9d0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -363,6 +363,15 @@ class RISCVTargetLowering : public TargetLowering { static std::pair computeVLMAXBounds(MVT ContainerVT, const RISCVSubtarget &Subtarget); + /// Given a vector (either fixed or scalable), return the scalable vector + /// corresponding to a vector register (i.e. an m1 register group). 
+ static MVT getM1VT(MVT VT) { + unsigned EltSizeInBits = VT.getVectorElementType().getSizeInBits(); + assert(EltSizeInBits <= RISCV::RVVBitsPerBlock && "Unexpected vector MVT"); + return MVT::getScalableVectorVT(VT.getVectorElementType(), + RISCV::RVVBitsPerBlock / EltSizeInBits); + } + static unsigned getRegClassIDForLMUL(RISCVVType::VLMUL LMul); static unsigned getSubregIndexByMVT(MVT VT, unsigned Index); static unsigned getRegClassIDForVecVT(MVT VT); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index 4cf8309ea17f4..3ba21e51e7c66 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -361,6 +361,25 @@ class NDSRVInstVD4DOT funct6, string opcodestr> let RVVConstraint = VMConstraint; } +class NDSRVInstVBFHCvt funct7, bits<5> vs1, string opcodestr> + : RVInst<(outs VR:$vd), (ins VR:$vs2, VMaskOp:$vm), + opcodestr, "$vd, $vs2", [], InstFormatR> { + bits<5> vs2; + bits<5> vd; + + let Inst{31-25} = funct7; + let Inst{24-20} = vs2; + let Inst{19-15} = vs1; + let Inst{14-12} = 0b100; + let Inst{11-7} = vd; + let Inst{6-0} = OPC_CUSTOM_2.Value; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + + let Uses = [VL, VTYPE]; +} + //===----------------------------------------------------------------------===// // Multiclass //===----------------------------------------------------------------------===// @@ -460,6 +479,18 @@ def NDS_LDGP : NDSRVInstLDGP<0b011, "nds.ldgp">; def NDS_SDGP : NDSRVInstSDGP<0b111, "nds.sdgp">; } // Predicates = [HasVendorXAndesPerf, IsRV64] +//===----------------------------------------------------------------------===// +// XAndesVBFHCvt +//===----------------------------------------------------------------------===// + +let Predicates = [HasVendorXAndesVBFHCvt], Constraints = "@earlyclobber $vd", + mayRaiseFPException = true in { +let RVVConstraint = VS2Constraint, DestEEW = EEWSEWx2 in +def NDS_VFWCVT_S_BF16 : 
NDSRVInstVBFHCvt<0b0000000, 0b00000, "nds.vfwcvt.s.bf16">; +let Uses = [FRM, VL, VTYPE] in +def NDS_VFNCVT_BF16_S : NDSRVInstVBFHCvt<0b0000000, 0b00001, "nds.vfncvt.bf16.s">; +} + //===----------------------------------------------------------------------===// // XAndesVPackFPH //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index b94fee3c6e575..09852c6fd5969 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1452,6 +1452,11 @@ def : Pat<(i32 (and GPRNoX0:$rs, 1023)), (QC_EXTU GPRNoX0:$rs, 10, 0)>; def : Pat<(i32 (and GPRNoX0:$rs, 2047)), (QC_EXTU GPRNoX0:$rs, 11, 0)>; } // Predicates = [HasVendorXqcibm, IsRV32] +let Predicates = [HasVendorXqcibm, HasStdExtZbb, IsRV32] in { +def: Pat<(i32 (cttz (not (i32 GPR:$rs1)))), (QC_CTO GPR:$rs1)>; +def: Pat<(i32 (ctlz (not (i32 GPR:$rs1)))), (QC_CLO GPR:$rs1)>; +} // Predicates = [HasVendorXqcibm, HasStdExtZbb, IsRV32] + let Predicates = [HasVendorXqciint, IsRV32] in def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>; diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index d7e6c71ea062e..32f4ab607a34c 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -703,6 +703,8 @@ def ANDES_A25 : RISCVProcessorModel<"andes-a25", FeatureStdExtF, FeatureStdExtD, FeatureStdExtC, + FeatureStdExtB, + FeatureStdExtZbc, FeatureVendorXAndesPerf]>; def ANDES_AX25 : RISCVProcessorModel<"andes-ax25", @@ -716,6 +718,8 @@ def ANDES_AX25 : RISCVProcessorModel<"andes-ax25", FeatureStdExtF, FeatureStdExtD, FeatureStdExtC, + FeatureStdExtB, + FeatureStdExtZbc, FeatureVendorXAndesPerf]>; defvar Andes45TuneFeatures = [TuneAndes45, @@ -737,6 +741,7 @@ def ANDES_N45 : RISCVProcessorModel<"andes-n45", FeatureStdExtF, FeatureStdExtD, FeatureStdExtC, + FeatureStdExtB, 
FeatureVendorXAndesPerf], Andes45TuneFeatures>; @@ -751,6 +756,7 @@ def ANDES_NX45 : RISCVProcessorModel<"andes-nx45", FeatureStdExtF, FeatureStdExtD, FeatureStdExtC, + FeatureStdExtB, FeatureVendorXAndesPerf], Andes45TuneFeatures>; @@ -765,6 +771,7 @@ def ANDES_A45 : RISCVProcessorModel<"andes-a45", FeatureStdExtF, FeatureStdExtD, FeatureStdExtC, + FeatureStdExtB, FeatureVendorXAndesPerf], Andes45TuneFeatures>; @@ -779,5 +786,6 @@ def ANDES_AX45 : RISCVProcessorModel<"andes-ax45", FeatureStdExtF, FeatureStdExtD, FeatureStdExtC, + FeatureStdExtB, FeatureVendorXAndesPerf], Andes45TuneFeatures>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 8a47453cedcd3..b43b915d0ad4f 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -34,6 +34,7 @@ #include "llvm/InitializePasses.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" @@ -118,7 +119,7 @@ static cl::opt cl::desc("Enable Machine Pipeliner for RISC-V"), cl::init(false), cl::Hidden); -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { RegisterTargetMachine X(getTheRISCV32Target()); RegisterTargetMachine Y(getTheRISCV64Target()); auto *PR = PassRegistry::getPassRegistry(); @@ -570,6 +571,10 @@ void RISCVPassConfig::addPreEmitPass() { addPass(createMachineCopyPropagationPass(true)); if (TM->getOptLevel() >= CodeGenOptLevel::Default) addPass(createRISCVLateBranchOptPass()); + // The IndirectBranchTrackingPass inserts lpad and could have changed the + // basic block alignment. It must be done before Branch Relaxation to + // prevent the adjusted offset exceeding the branch range. 
+ addPass(createRISCVIndirectBranchTrackingPass()); addPass(&BranchRelaxationPassID); addPass(createRISCVMakeCompressibleOptPass()); } @@ -581,7 +586,6 @@ void RISCVPassConfig::addPreEmitPass2() { // ensuring return instruction is detected correctly. addPass(createRISCVPushPopOptimizationPass()); } - addPass(createRISCVIndirectBranchTrackingPass()); addPass(createRISCVExpandPseudoPass()); // Schedule the expansion of AMOs at the last possible moment, avoiding the diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index bee47527cf428..aadda2ce85529 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -840,33 +840,64 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); } case TTI::SK_Reverse: { + + if (!LT.second.isVector()) + return InstructionCost::getInvalid(); + // TODO: Cases to improve here: // * Illegal vector types // * i64 on RV32 - // * i1 vector - // At low LMUL, most of the cost is producing the vrgather index register. - // At high LMUL, the cost of the vrgather itself will dominate. 
- // Example sequence: - // csrr a0, vlenb - // srli a0, a0, 3 - // addi a0, a0, -1 - // vsetvli a1, zero, e8, mf8, ta, mu (ignored) - // vid.v v9 - // vrsub.vx v10, v9, a0 - // vrgather.vv v9, v8, v10 - InstructionCost LenCost = 3; + if (Tp->getElementType()->isIntegerTy(1)) { + VectorType *WideTy = + VectorType::get(IntegerType::get(Tp->getContext(), 8), + cast(Tp)->getElementCount()); + return getCastInstrCost(Instruction::ZExt, WideTy, Tp, + TTI::CastContextHint::None, CostKind) + + getShuffleCost(TTI::SK_Reverse, WideTy, {}, CostKind, 0, nullptr) + + getCastInstrCost(Instruction::Trunc, Tp, WideTy, + TTI::CastContextHint::None, CostKind); + } + + MVT ContainerVT = LT.second; if (LT.second.isFixedLengthVector()) - // vrsub.vi has a 5 bit immediate field, otherwise an li suffices - LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1; - unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}; - if (LT.second.isFixedLengthVector() && - isInt<5>(LT.second.getVectorNumElements() - 1)) - Opcodes[1] = RISCV::VRSUB_VI; + ContainerVT = TLI->getContainerForFixedLengthVector(LT.second); + MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT); + if (ContainerVT.bitsLE(M1VT)) { + // Example sequence: + // csrr a0, vlenb + // srli a0, a0, 3 + // addi a0, a0, -1 + // vsetvli a1, zero, e8, mf8, ta, mu (ignored) + // vid.v v9 + // vrsub.vx v10, v9, a0 + // vrgather.vv v9, v8, v10 + InstructionCost LenCost = 3; + if (LT.second.isFixedLengthVector()) + // vrsub.vi has a 5 bit immediate field, otherwise an li suffices + LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 
0 : 1; + unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}; + if (LT.second.isFixedLengthVector() && + isInt<5>(LT.second.getVectorNumElements() - 1)) + Opcodes[1] = RISCV::VRSUB_VI; + InstructionCost GatherCost = + getRISCVInstructionCost(Opcodes, LT.second, CostKind); + return LT.first * (LenCost + GatherCost); + } + + // At high LMUL, we split into a series of M1 reverses (see + // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate + // the resulting gap at the bottom (for fixed vectors only). The important + // bit is that the cost scales linearly, not quadratically with LMUL. + unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX}; + InstructionCost FixedCost = + getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3; + unsigned Ratio = + ContainerVT.getVectorMinNumElements() / M1VT.getVectorMinNumElements(); InstructionCost GatherCost = - getRISCVInstructionCost(Opcodes, LT.second, CostKind); - // Mask operation additionally required extend and truncate - InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0; - return LT.first * (LenCost + GatherCost + ExtendCost); + getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio; + InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 
0 : + getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind); + return FixedCost + LT.first * (GatherCost + SlideCost); } } return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); diff --git a/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp b/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp index 0a675d6849122..fc0965d263a8a 100644 --- a/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp +++ b/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/RISCVTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; Target &llvm::getTheRISCV32Target() { @@ -20,7 +21,8 @@ Target &llvm::getTheRISCV64Target() { return TheRISCV64Target; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeRISCVTargetInfo() { RegisterTarget X( getTheRISCV32Target(), "riscv32", "32-bit RISC-V", "RISCV"); RegisterTarget Y( diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp index 21a952649ff51..cc77ddd748a94 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #define GET_INSTRINFO_MC_DESC #define ENABLE_INSTR_PREDICATE_VERIFIER @@ -78,7 +79,8 @@ static MCInstrAnalysis *createSPIRVInstrAnalysis(const MCInstrInfo *Info) { return new SPIRVMCInstrAnalysis(Info); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSPIRVTargetMC() { for (Target *T : {&getTheSPIRV32Target(), &getTheSPIRV64Target(), &getTheSPIRVLogicalTarget()}) { RegisterMCAsmInfo X(*T); diff --git 
a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 26b94788b810e..1ebfde2a603b9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -35,6 +35,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -657,7 +658,8 @@ INITIALIZE_PASS(SPIRVAsmPrinter, "spirv-asm-printer", "SPIRV Assembly Printer", false, false) // Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSPIRVAsmPrinter() { RegisterAsmPrinter X(getTheSPIRV32Target()); RegisterAsmPrinter Y(getTheSPIRV64Target()); RegisterAsmPrinter Z(getTheSPIRVLogicalTarget()); diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 6842e5ff067cf..401a762cd62a3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -674,6 +674,8 @@ defm : DemangledNativeBuiltin<"ndrange_3D", OpenCL_std, Enqueue, 1, 3, OpBuildND // Spec constant builtin records: defm : DemangledNativeBuiltin<"__spirv_SpecConstant", OpenCL_std, SpecConstant, 2, 2, OpSpecConstant>; +defm : DemangledNativeBuiltin<"__spirv_SpecConstant", GLSL_std_450, + SpecConstant, 2, 2, OpSpecConstant>; defm : DemangledNativeBuiltin<"__spirv_SpecConstantComposite", OpenCL_std, SpecConstant, 1, 0, OpSpecConstantComposite>; // Async Copy and Prefetch builtin records: diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 36cc5cbe655bc..a412887e51adb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -480,7 +480,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, .addUse(FuncVReg); 
addStringImm(F.getName(), MIB); } else if (F.getLinkage() != GlobalValue::InternalLinkage && - F.getLinkage() != GlobalValue::PrivateLinkage) { + F.getLinkage() != GlobalValue::PrivateLinkage && + F.getVisibility() != GlobalValue::HiddenVisibility) { SPIRV::LinkageType::LinkageType LnkTy = F.isDeclaration() ? SPIRV::LinkageType::Import diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 911a6966aaef0..851e0c6b81fcf 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3898,7 +3898,8 @@ bool SPIRVInstructionSelector::selectGlobalValue( if (hasInitializer(GlobalVar) && !Init) return true; - bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage(); + bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage() && + !GV->hasHiddenVisibility(); SPIRV::LinkageType::LinkageType LnkType = GV->isDeclarationForLinker() ? SPIRV::LinkageType::Import diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 82fe23a22b60f..d7cf211ba84dc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -28,6 +28,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Pass.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" @@ -35,7 +36,7 @@ using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() { // Register the target. 
RegisterTargetMachine X(getTheSPIRV32Target()); RegisterTargetMachine Y(getTheSPIRV64Target()); diff --git a/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp b/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp index febefc0249204..c4d086d7da5c9 100644 --- a/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp +++ b/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/SPIRVTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -24,7 +25,8 @@ Target &llvm::getTheSPIRVLogicalTarget() { return TheSPIRVLogicalTarget; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSPIRVTargetInfo() { RegisterTarget X(getTheSPIRV32Target(), "spirv32", "SPIR-V 32-bit", "SPIRV"); RegisterTarget Y(getTheSPIRV64Target(), "spirv64", diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 90aacacd8ed2d..f1009999dc1b7 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/SparcMCExpr.h" +#include "MCTargetDesc/SparcMCAsmInfo.h" #include "MCTargetDesc/SparcMCTargetDesc.h" #include "TargetInfo/SparcTargetInfo.h" #include "llvm/ADT/SmallVector.h" @@ -29,6 +29,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" @@ -109,7 +110,8 @@ class SparcAsmParser : public MCTargetAsmParser { ParseStatus parseExpression(int64_t &Val); // Helper function for dealing with %lo / %hi in PIC mode. 
- const SparcMCExpr *adjustPICRelocation(uint16_t VK, const MCExpr *subExpr); + const MCSpecifierExpr *adjustPICRelocation(uint16_t VK, + const MCExpr *subExpr); // Helper function to see if current token can start an expression. bool isPossibleExpression(const AsmToken &Token); @@ -1642,7 +1644,7 @@ MCRegister SparcAsmParser::matchRegisterName(const AsmToken &Tok, static bool hasGOTReference(const MCExpr *Expr) { switch (Expr->getKind()) { case MCExpr::Target: - if (const SparcMCExpr *SE = dyn_cast(Expr)) + if (const MCSpecifierExpr *SE = dyn_cast(Expr)) return hasGOTReference(SE->getSubExpr()); break; @@ -1668,8 +1670,8 @@ static bool hasGOTReference(const MCExpr *Expr) { return false; } -const SparcMCExpr *SparcAsmParser::adjustPICRelocation(uint16_t RelType, - const MCExpr *subExpr) { +const MCSpecifierExpr * +SparcAsmParser::adjustPICRelocation(uint16_t RelType, const MCExpr *subExpr) { // When in PIC mode, "%lo(...)" and "%hi(...)" behave differently. // If the expression refers contains _GLOBAL_OFFSET_TABLE, it is // actually a %pc10 or %pc22 relocation. 
Otherwise, they are interpreted @@ -1749,7 +1751,8 @@ bool SparcAsmParser::isPossibleExpression(const AsmToken &Token) { } } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSparcAsmParser() { RegisterMCAsmParser A(getTheSparcTarget()); RegisterMCAsmParser B(getTheSparcV9Target()); RegisterMCAsmParser C(getTheSparcelTarget()); diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index 173fe3df0d95a..fab94fb4d40ca 100644 --- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -18,6 +18,7 @@ #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -46,8 +47,8 @@ static MCDisassembler *createSparcDisassembler(const Target &T, return new SparcDisassembler(STI, Ctx); } - -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSparcDisassembler() { // Register the disassembler. 
TargetRegistry::RegisterMCDisassembler(getTheSparcTarget(), createSparcDisassembler); diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index bef7f3c02dae3..2a581d381d4ab 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/SparcFixupKinds.h" -#include "MCTargetDesc/SparcMCExpr.h" +#include "MCTargetDesc/SparcMCAsmInfo.h" #include "MCTargetDesc/SparcMCTargetDesc.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" @@ -72,7 +72,7 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup, if (mc::isRelocation(Fixup.getKind())) return Kind; - if (const SparcMCExpr *SExpr = dyn_cast(Fixup.getValue())) { + if (const auto *SExpr = dyn_cast(Fixup.getValue())) { if (SExpr->getSpecifier() == ELF::R_SPARC_DISP32) return ELF::R_SPARC_DISP32; } diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp index 800567bf58ffa..36365593e2460 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "SparcMCAsmInfo.h" -#include "SparcMCExpr.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCExpr.h" diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h index 7ea800f119174..a4a2fa3f9933e 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h @@ -36,6 +36,11 @@ class SparcELFMCAsmInfo : public MCAsmInfoELF { const MCSpecifierExpr &Expr) const override; }; 
+namespace Sparc { +uint16_t parseSpecifier(StringRef name); +StringRef getSpecifierName(uint16_t S); +} // namespace Sparc + } // end namespace llvm #endif // LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index 4ce9bea5d7958..8ba99719946a2 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/SparcFixupKinds.h" -#include "SparcMCExpr.h" +#include "MCTargetDesc/SparcMCAsmInfo.h" #include "SparcMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -134,7 +134,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, assert(MO.isExpr()); const MCExpr *Expr = MO.getExpr(); - if (const SparcMCExpr *SExpr = dyn_cast(Expr)) { + if (auto *SExpr = dyn_cast(Expr)) { Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier())); return 0; } @@ -164,7 +164,7 @@ unsigned SparcMCCodeEmitter::getSImm5OpValue(const MCInst &MI, unsigned OpNo, if (const MCConstantExpr *CE = dyn_cast(Expr)) return CE->getValue(); - if (const SparcMCExpr *SExpr = dyn_cast(Expr)) { + if (auto *SExpr = dyn_cast(Expr)) { Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier())); return 0; } @@ -190,7 +190,7 @@ SparcMCCodeEmitter::getSImm13OpValue(const MCInst &MI, unsigned OpNo, if (const MCConstantExpr *CE = dyn_cast(Expr)) return CE->getValue(); - if (const SparcMCExpr *SExpr = dyn_cast(Expr)) { + if (auto *SExpr = dyn_cast(Expr)) { Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier())); return 0; } diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp index 6d43b93713906..1ee6e80985605 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp 
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -#include "SparcMCExpr.h" +#include "MCTargetDesc/SparcMCAsmInfo.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h deleted file mode 100644 index 8e7c173c70ccb..0000000000000 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h +++ /dev/null @@ -1,32 +0,0 @@ -//====- SparcMCExpr.h - Sparc specific MC expression classes --*- C++ -*-=====// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file describes Sparc-specific MCExprs, used for modifiers like -// "%hi" or "%lo" etc., -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H -#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H - -#include "SparcFixupKinds.h" -#include "llvm/MC/MCExpr.h" - -namespace llvm { - -class StringRef; -using SparcMCExpr = MCSpecifierExpr; - -namespace Sparc { -uint16_t parseSpecifier(StringRef name); -StringRef getSpecifierName(uint16_t S); -} // namespace Sparc - -} // end namespace llvm. 
- -#endif diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index 72f9b3bcd9681..fa07578e512b5 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" namespace llvm { @@ -107,7 +108,8 @@ static MCInstPrinter *createSparcMCInstPrinter(const Triple &T, return new SparcInstPrinter(MAI, MII, MRI); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSparcTargetMC() { // Register the MC asm info. RegisterMCAsmInfoFn X(getTheSparcTarget(), createSparcMCAsmInfo); RegisterMCAsmInfoFn Y(getTheSparcV9Target(), createSparcV9MCAsmInfo); diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index f4201f9a8dc1a..8e7e2e5f73709 100644 --- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/SparcInstPrinter.h" -#include "MCTargetDesc/SparcMCExpr.h" +#include "MCTargetDesc/SparcMCAsmInfo.h" #include "MCTargetDesc/SparcMCTargetDesc.h" #include "MCTargetDesc/SparcTargetStreamer.h" #include "Sparc.h" @@ -32,6 +32,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -82,7 +83,7 @@ class SparcAsmPrinter : public AsmPrinter { static MCOperand createSparcMCOperand(uint16_t Kind, MCSymbol *Sym, MCContext &OutContext) { const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym, OutContext); - const 
SparcMCExpr *expr = MCSpecifierExpr::create(MCSym, Kind, OutContext); + auto *expr = MCSpecifierExpr::create(MCSym, Kind, OutContext); return MCOperand::createExpr(expr); } static MCOperand createPCXCallOP(MCSymbol *Label, @@ -101,7 +102,7 @@ static MCOperand createPCXRelExprOp(uint16_t Spec, MCSymbol *GOTLabel, const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Cur, Start, OutContext); const MCBinaryExpr *Add = MCBinaryExpr::createAdd(GOT, Sub, OutContext); - const SparcMCExpr *expr = MCSpecifierExpr::create(Add, Spec, OutContext); + auto *expr = MCSpecifierExpr::create(Add, Spec, OutContext); return MCOperand::createExpr(expr); } @@ -506,7 +507,8 @@ INITIALIZE_PASS(SparcAsmPrinter, "sparc-asm-printer", "Sparc Assembly Printer", false, false) // Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSparcAsmPrinter() { RegisterAsmPrinter X(getTheSparcTarget()); RegisterAsmPrinter Y(getTheSparcV9Target()); RegisterAsmPrinter Z(getTheSparcelTarget()); diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index a6ea079746095..21ecf3d5ed70e 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -12,7 +12,7 @@ //===----------------------------------------------------------------------===// #include "SparcISelLowering.h" -#include "MCTargetDesc/SparcMCExpr.h" +#include "MCTargetDesc/SparcMCAsmInfo.h" #include "MCTargetDesc/SparcMCTargetDesc.h" #include "SparcMachineFunctionInfo.h" #include "SparcRegisterInfo.h" diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp index d4d8cbb044dec..52076a6b4dd22 100644 --- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp @@ -18,10 +18,11 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" 
#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include using namespace llvm; -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTarget() { // Register the target. RegisterTargetMachine X(getTheSparcTarget()); RegisterTargetMachine Y(getTheSparcV9Target()); diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp index a42a67d91d848..711bf9b31a377 100644 --- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp +++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "SparcTargetObjectFile.h" -#include "MCTargetDesc/SparcMCExpr.h" +#include "MCTargetDesc/SparcMCAsmInfo.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" diff --git a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp index 1f8837eb01949..2bfcffbd4fd0b 100644 --- a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp +++ b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/SparcTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; Target &llvm::getTheSparcTarget() { @@ -23,7 +24,8 @@ Target &llvm::getTheSparcelTarget() { return TheSparcelTarget; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSparcTargetInfo() { RegisterTarget X(getTheSparcTarget(), "sparc", "Sparc", "Sparc"); RegisterTarget Y( diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 6ee2a87565baa..04a4c36109246 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ 
b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SMLoc.h" #include "llvm/TargetParser/SubtargetFeature.h" @@ -1784,6 +1785,7 @@ bool SystemZAsmParser::isLabel(AsmToken &Token) { // Force static initialization. // NOLINTNEXTLINE(readability-identifier-naming) -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSystemZAsmParser() { RegisterMCAsmParser X(getTheSystemZTarget()); } diff --git a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index bd188f5b4b520..6ae529e974186 100644 --- a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -13,6 +13,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" #include #include @@ -45,7 +46,8 @@ static MCDisassembler *createSystemZDisassembler(const Target &T, } // NOLINTNEXTLINE(readability-identifier-naming) -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSystemZDisassembler() { // Register the disassembler. 
TargetRegistry::RegisterMCDisassembler(getTheSystemZTarget(), createSystemZDisassembler); diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index d2ed5cac5c576..86e340b7ff1bd 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -23,6 +23,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -239,7 +240,8 @@ static MCInstrAnalysis *createSystemZMCInstrAnalysis(const MCInstrInfo *Info) { return new MCInstrAnalysis(Info); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSystemZTargetMC() { // Register the MCAsmInfo. TargetRegistry::RegisterMCAsmInfo(getTheSystemZTarget(), createSystemZMCAsmInfo); diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index aaf12b88de132..6f9d25c050b71 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -31,6 +31,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Chrono.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ConvertEBCDIC.h" #include "llvm/Support/FormatVariadic.h" @@ -1738,6 +1739,7 @@ INITIALIZE_PASS(SystemZAsmPrinter, "systemz-asm-printer", "SystemZ Assembly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSystemZAsmPrinter() { RegisterAsmPrinter X(getTheSystemZTarget()); } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index ddb5a730a6fd3..ece8928accd0c 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Transforms/Scalar.h" #include @@ -36,7 +37,8 @@ static cl::opt EnableMachineCombinerPass( cl::init(true), cl::Hidden); // NOLINTNEXTLINE(readability-identifier-naming) -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSystemZTarget() { // Register the target. 
RegisterTargetMachine X(getTheSystemZTarget()); auto &PR = *PassRegistry::getPassRegistry(); diff --git a/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp index 91e4c91b00b9d..703051f6f2d3c 100644 --- a/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp +++ b/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/SystemZTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -17,7 +18,8 @@ Target &llvm::getTheSystemZTarget() { } // NOLINTNEXTLINE(readability-identifier-naming) -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeSystemZTargetInfo() { RegisterTarget X( getTheSystemZTarget(), "systemz", "SystemZ", "SystemZ"); } diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp index c54ce40de45ff..7987950a2a0aa 100644 --- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp +++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp @@ -25,6 +25,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" #include @@ -1510,7 +1511,7 @@ ParseStatus VEAsmParser::parseVEAsmOperand(std::unique_ptr &Op) { } // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmParser() { RegisterMCAsmParser A(getTheVETarget()); } diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp index 00487a1f5bb38..88200c5fc97eb 100644 --- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp +++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -47,7 +48,8 @@ static MCDisassembler *createVEDisassembler(const Target &T, return new VEDisassembler(STI, Ctx); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeVEDisassembler() { // Register the disassembler. TargetRegistry::RegisterMCDisassembler(getTheVETarget(), createVEDisassembler); diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp index 019748413d32e..699ef9808eb88 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -84,7 +85,7 @@ static MCInstPrinter *createVEMCInstPrinter(const Triple &T, return new VEInstPrinter(MAI, MII, MRI); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetMC() { // Register the MC asm info. 
RegisterMCAsmInfoFn X(getTheVETarget(), createVEMCAsmInfo); diff --git a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp index 7c4bf1cfd672e..dcc54b4cec01a 100644 --- a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp +++ b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/VETargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -16,7 +17,7 @@ Target &llvm::getTheVETarget() { return TheVETarget; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetInfo() { RegisterTarget X(getTheVETarget(), "ve", "VE", "VE"); } diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp index af0dc0404d3cc..f7d770c18f883 100644 --- a/llvm/lib/Target/VE/VEAsmPrinter.cpp +++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp @@ -28,6 +28,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -419,6 +420,6 @@ INITIALIZE_PASS(VEAsmPrinter, "ve-asm-printer", "VE Assembly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmPrinter() { RegisterAsmPrinter X(getTheVETarget()); } diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp index 664a54cea7c52..14b8e330d87a4 100644 --- a/llvm/lib/Target/VE/VETargetMachine.cpp +++ b/llvm/lib/Target/VE/VETargetMachine.cpp @@ -19,13 +19,14 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include using namespace llvm; #define DEBUG_TYPE "ve" -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETarget() { // Register the target. RegisterTargetMachine X(getTheVETarget()); diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 9649381f07b14..e4140755edf4e 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -32,6 +32,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolWasm.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/SourceMgr.h" using namespace llvm; @@ -1282,7 +1283,8 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser { } // end anonymous namespace // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyAsmParser() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeWebAssemblyAsmParser() { RegisterMCAsmParser X(getTheWebAssemblyTarget32()); RegisterMCAsmParser Y(getTheWebAssemblyTarget64()); } diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index 0399f9d38e4eb..8a29a5902ce22 100644 --- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -27,6 +27,7 @@ #include "llvm/MC/MCSymbolWasm.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" #include "llvm/Support/LEB128.h" @@ -66,7 +67,7 @@ static MCDisassembler *createWebAssemblyDisassembler(const Target &T, return new WebAssemblyDisassembler(STI, Ctx, std::move(MCII)); } -extern "C" LLVM_EXTERNAL_VISIBILITY void +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyDisassembler() { // Register the disassembler for each target. TargetRegistry::RegisterMCDisassembler(getTheWebAssemblyTarget32(), diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index a4162a07ee33f..6c0031f429c6d 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -124,7 +125,8 @@ static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) { } // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeWebAssemblyTargetMC() { for (Target *T : {&getTheWebAssemblyTarget32(), &getTheWebAssemblyTarget64()}) { // Register the MC asm info. diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp index ef2c77ade8cc5..e65fa8e60aeb1 100644 --- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp +++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp @@ -13,6 +13,7 @@ #include "TargetInfo/WebAssemblyTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; #define DEBUG_TYPE "wasm-target-info" @@ -26,7 +27,8 @@ Target &llvm::getTheWebAssemblyTarget64() { return TheWebAssemblyTarget64; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeWebAssemblyTargetInfo() { RegisterTarget X(getTheWebAssemblyTarget32(), "wasm32", "WebAssembly 32-bit", "WebAssembly"); RegisterTarget Y(getTheWebAssemblyTarget64(), "wasm64", diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index c61ed3c7d5d81..b43b7dbfc36be 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -46,6 +46,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolWasm.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -759,7 +760,8 @@ INITIALIZE_PASS(WebAssemblyAsmPrinter, "webassembly-asm-printer", "WebAssembly Assmebly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeWebAssemblyAsmPrinter() { RegisterAsmPrinter X(getTheWebAssemblyTarget32()); RegisterAsmPrinter Y(getTheWebAssemblyTarget64()); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index adb446b20ebf5..6e551e5c8ee4e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -28,6 +28,7 @@ #include "llvm/InitializePasses.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LowerAtomicPass.h" @@ -53,7 +54,8 @@ static cl::opt WasmDisableFixIrreducibleControlFlowPass( " irreducible control flow optimization pass"), cl::init(false)); -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeWebAssemblyTarget() { // Register the target. RegisterTargetMachine X( getTheWebAssemblyTarget32()); diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp index 89093b2e1a3f5..33dc0a232815c 100644 --- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -242,6 +242,26 @@ bool X86FixupInstTuningPass::processInstruction( return ProcessUNPCKToIntDomain(NewOpc); }; + auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool { + if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc)) + return false; + // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits. 
+ APInt MaskW = + APInt(8, MI.getOperand(NumOperands - 1).getImm(), /*IsSigned=*/false); + APInt MaskD = APIntOps::ScaleBitMask(MaskW, 4, /*MatchAllBits=*/true); + if (MaskW != APIntOps::ScaleBitMask(MaskD, 8, /*MatchAllBits=*/true)) + return false; + APInt NewMaskD = APInt::getSplat(NumElts, MaskD); + LLVM_DEBUG(dbgs() << "Replacing: " << MI); + { + MI.setDesc(TII->get(MovOpc)); + MI.removeOperand(NumOperands - 1); + MI.addOperand(MachineOperand::CreateImm(NewMaskD.getZExtValue())); + } + LLVM_DEBUG(dbgs() << " With: " << MI); + return true; + }; + auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask, unsigned MovImm) -> bool { if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm) @@ -270,6 +290,12 @@ bool X86FixupInstTuningPass::processInstruction( return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) || ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3); + case X86::VPBLENDWrri: + // TODO: Add X86::VPBLENDWrmi handling + // TODO: Add X86::VPBLENDWYrri handling + // TODO: Add X86::VPBLENDWYrmi handling + return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4); + case X86::VPERMILPDri: return ProcessVPERMILPDri(X86::VSHUFPDrri); case X86::VPERMILPDYri: diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b4670e270141f..1ca5fc5376f0f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8268,6 +8268,7 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd) { + using namespace SDPatternMatch; MVT VT = BV->getSimpleValueType(0); if (!Subtarget.hasSSE3() || !VT.isFloatingPoint()) @@ -8302,14 +8303,8 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, // Try to match the following pattern: // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) // Early exit if we cannot match that sequence. 
- if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isa(Op0.getOperand(1)) || - Op0.getOperand(1) != Op1.getOperand(1)) - return false; - - unsigned I0 = Op0.getConstantOperandVal(1); - if (I0 != i) + if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) || + !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i)))) return false; // We found a valid add/sub node, make sure its the same opcode as previous @@ -8319,16 +8314,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, Opc[i % 2] = Opcode; // Update InVec0 and InVec1. - if (InVec0.isUndef()) { + if (InVec0.isUndef()) InVec0 = Op0.getOperand(0); - if (InVec0.getSimpleValueType() != VT) - return false; - } - if (InVec1.isUndef()) { + if (InVec1.isUndef()) InVec1 = Op1.getOperand(0); - if (InVec1.getSimpleValueType() != VT) - return false; - } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. @@ -9614,13 +9603,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. // TODO: Detect subvector broadcast here instead of DAG combine? -static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, +static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); - - assert((ResVT.is256BitVector() || - ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); + assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && + "Value type must be 256-/512-bit wide"); unsigned NumOperands = Op.getNumOperands(); unsigned NumFreezeUndef = 0; @@ -9688,13 +9676,11 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, // zeros) of the result of a node that already zeros all upper bits of // k-register. 
// TODO: Merge this with LowerAVXCONCAT_VECTORS? -static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, +static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG & DAG) { - SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); unsigned NumOperands = Op.getNumOperands(); - assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); @@ -9766,19 +9752,18 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == MVT::i1) - return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); - - assert((VT.is256BitVector() && Op.getNumOperands() == 2) || - (VT.is512BitVector() && (Op.getNumOperands() == 2 || - Op.getNumOperands() == 4))); + return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. - // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors - return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget); + assert((VT.is256BitVector() && Op.getNumOperands() == 2) || + (VT.is512BitVector() && + (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); + return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget); } //===----------------------------------------------------------------------===// @@ -43316,51 +43301,6 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } -// We are looking for a shuffle where both sources are concatenated with undef -// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so -// if we can express this as a single-source shuffle, that's preferable. 
-static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasAVX2() || !isa(N)) - return SDValue(); - - EVT VT = N->getValueType(0); - - // We only care about shuffles of 128/256-bit vectors of 32/64-bit values. - if (!VT.is128BitVector() && !VT.is256BitVector()) - return SDValue(); - - if (VT.getVectorElementType() != MVT::i32 && - VT.getVectorElementType() != MVT::i64 && - VT.getVectorElementType() != MVT::f32 && - VT.getVectorElementType() != MVT::f64) - return SDValue(); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - // Check that both sources are concats with undef. - if (N0.getOpcode() != ISD::CONCAT_VECTORS || - N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || - N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() || - !N1.getOperand(1).isUndef()) - return SDValue(); - - // Construct the new shuffle mask. Elements from the first source retain their - // index, but elements from the second source no longer need to skip an undef. - SmallVector Mask; - int NumElts = VT.getVectorNumElements(); - - auto *SVOp = cast(N); - for (int Elt : SVOp->getMask()) - Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2)); - - SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0), - N1.getOperand(0)); - return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask); -} - /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the /// low half of each source vector and does not set any high half elements in /// the destination vector, narrow the shuffle to half its original size. 
@@ -43416,15 +43356,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true)) return LD; - // For AVX2, we sometimes want to combine - // (vector_shuffle (concat_vectors t1, undef) - // (concat_vectors t2, undef)) - // Into: - // (vector_shuffle (concat_vectors t1, t2), undef) - // Since the latter can be efficiently lowered with VPERMD/VPERMQ - if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget)) - return ShufConcat; - if (isTargetShuffle(N->getOpcode())) { SDValue Op(N, 0); if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget)) @@ -45517,6 +45448,7 @@ static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) { static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast"); if (!DCI.isBeforeLegalizeOps()) @@ -45530,15 +45462,6 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Op = N->getOperand(0); EVT SrcVT = Op.getValueType(); - if (!Op.hasOneUse()) - return SDValue(); - - // Look for logic ops. - if (Op.getOpcode() != ISD::AND && - Op.getOpcode() != ISD::OR && - Op.getOpcode() != ISD::XOR) - return SDValue(); - // Make sure we have a bitcast between mask registers and a scalar type. 
if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && DstVT.isScalarInteger()) && @@ -45546,18 +45469,18 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, SrcVT.isScalarInteger())) return SDValue(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); + SDValue LHS, RHS; - if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST && - LHS.getOperand(0).getValueType() == DstVT) - return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0), - DAG.getBitcast(DstVT, RHS)); + // Look for logic ops. + if (!sd_match(Op, m_OneUse(m_BitwiseLogic(m_Value(LHS), m_Value(RHS))))) + return SDValue(); - if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST && - RHS.getOperand(0).getValueType() == DstVT) + // If either operand was bitcast from DstVT, then perform logic with DstVT (at + // least one of the getBitcast() will fold away). + if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) || + sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT))))) return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, - DAG.getBitcast(DstVT, LHS), RHS.getOperand(0)); + DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS)); // If the RHS is a vXi1 build vector, this is a good reason to flip too. // Most of these have to move a constant from the scalar domain anyway. @@ -46081,22 +46004,18 @@ static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, // Given a ABS node, detect the following pattern: // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))). // This is useful as it is the input into a SAD pattern. 
-static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) { - SDValue AbsOp1 = Abs->getOperand(0); - if (AbsOp1.getOpcode() != ISD::SUB) - return false; - - Op0 = AbsOp1.getOperand(0); - Op1 = AbsOp1.getOperand(1); +static bool detectZextAbsDiff(SDValue Abs, SDValue &Op0, SDValue &Op1) { + using namespace SDPatternMatch; // Check if the operands of the sub are zero-extended from vectors of i8. - if (Op0.getOpcode() != ISD::ZERO_EXTEND || - Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || - Op1.getOpcode() != ISD::ZERO_EXTEND || - Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) - return false; - - return true; + EVT SrcVT0, SrcVT1; + return sd_match( + Abs, + m_UnaryOp(ISD::ABS, + m_Sub(m_AllOf(m_Value(Op0), m_ZExt(m_VT(SrcVT0))), + m_AllOf(m_Value(Op1), m_ZExt(m_VT(SrcVT1)))))) && + SrcVT0.getVectorElementType() == MVT::i8 && + SrcVT1.getVectorElementType() == MVT::i8; } static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, @@ -46478,6 +46397,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // Match shuffle + add pyramid. ISD::NodeType BinOp; SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD}); + if (!Root) + return SDValue(); // The operand is expected to be zero extended from i8 // (verified in detectZextAbsDiff). @@ -46487,16 +46408,11 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // Also the sign extend is basically zero extend // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. 
- if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || - Root.getOpcode() == ISD::ZERO_EXTEND || - Root.getOpcode() == ISD::ANY_EXTEND)) + if (Root.getOpcode() == ISD::SIGN_EXTEND || + Root.getOpcode() == ISD::ZERO_EXTEND || + Root.getOpcode() == ISD::ANY_EXTEND) Root = Root.getOperand(0); - // If there was a match, we want Root to be a select that is the root of an - // abs-diff pattern. - if (!Root || Root.getOpcode() != ISD::ABS) - return SDValue(); - // Check whether we have an abs-diff pattern feeding into the select. SDValue Zext0, Zext1; if (!detectZextAbsDiff(Root, Zext0, Zext1)) @@ -47675,27 +47591,19 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, static SDValue combineLogicBlendIntoConditionalNegate( EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; EVT MaskVT = Mask.getValueType(); assert(MaskVT.isInteger() && DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && "Mask must be zero/all-bits"); - if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT) + if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT || + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) return SDValue(); - if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) - return SDValue(); - - auto IsNegV = [](SDNode *N, SDValue V) { - return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && - ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); - }; SDValue V; - if (IsNegV(Y.getNode(), X)) - V = X; - else if (IsNegV(X.getNode(), Y)) - V = Y; - else + if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) && + !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V))))) return SDValue(); SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); @@ -50931,7 +50839,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, } /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). 
-static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) { +static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP"); MVT VT = N->getSimpleValueType(0); @@ -50953,7 +50862,7 @@ static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) { X = DAG.getBitcast(VT, X); Y = DAG.getBitcast(VT, Y); - return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y); + return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y); } /// Try to fold: @@ -51317,7 +51226,8 @@ static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, /// If this is a zero/all-bits result that is bitwise-anded with a low bits /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and' /// with a shift-right to eliminate loading the vector constant mask value. -static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, +static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); @@ -51347,7 +51257,6 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, Y = Op1; } if (X && Y) { - SDLoc DL(N); SDValue Sra = getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X, VT.getScalarSizeInBits() - 1, DAG); @@ -51370,7 +51279,6 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, if (EltBitWidth != DAG.ComputeNumSignBits(Op0)) return SDValue(); - SDLoc DL(N); unsigned ShiftVal = SplatVal.countr_one(); SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8); SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt); @@ -51401,13 +51309,11 @@ static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) { /// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)) /// This undoes the inverse fold performed in InstCombine -static SDValue 
combineAndNotOrIntoAndNotAnd(SDNode *N, SelectionDAG &DAG) { - +static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG) { using namespace llvm::SDPatternMatch; MVT VT = N->getSimpleValueType(0); - SDLoc DL(N); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.hasAndNot(SDValue(N, 0))) + if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0))) return SDValue(); SDValue X, Y, Z; @@ -51934,16 +51840,16 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; - if (SDValue R = combineAndNotIntoANDNP(N, DAG)) + if (SDValue R = combineAndNotIntoANDNP(N, dl ,DAG)) return R; - if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) + if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget)) return ShiftRight; if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) return R; - if (SDValue R = combineAndNotOrIntoAndNotAnd(N, DAG)) + if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG)) return R; // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2)) @@ -52176,36 +52082,14 @@ static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, return DAG.getNode(ISD::OR, DL, VT, X, Y); } -// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern. +// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern. +// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern. +// Waiting for ANDNP combine allows other combines to happen that prevent +// matching. static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { - if (N->getOpcode() != ISD::OR) - return false; - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - // Canonicalize AND to LHS. - if (N1.getOpcode() == ISD::AND) - std::swap(N0, N1); - - // Attempt to match OR(AND(M,Y),ANDNP(M,X)). 
- if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) - return false; - - Mask = N1.getOperand(0); - X = N1.getOperand(1); - - // Check to see if the mask appeared in both the AND and ANDNP. - if (N0.getOperand(0) == Mask) - Y = N0.getOperand(1); - else if (N0.getOperand(1) == Mask) - Y = N0.getOperand(0); - else - return false; - - // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for - // ANDNP combine allows other combines to happen that prevent matching. - return true; + using namespace SDPatternMatch; + return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)), + m_And(m_Deferred(Mask), m_Value(Y)))); } // Try to fold: @@ -59687,16 +59571,6 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - // For AVX1 only, if we are extracting from a 256-bit and+not (which will - // eventually get combined/lowered into ANDNP) with a concatenated operand, - // split the 'and' into 128-bit ops to avoid the concatenate and extract. - // We let generic combining take over from there to simplify the - // insert/extract and 'not'. - // This pattern emerges during AVX1 legalization. We handle it before lowering - // to avoid complications like splitting constant vector loads. - - // Capture the original wide type in the likely case that we need to bitcast - // back to this type. 
if (!N->getValueType(0).isSimple()) return SDValue(); @@ -59712,8 +59586,14 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc DL(N); - if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && - TLI.isTypeLegal(InVecVT) && + // For AVX1 only, if we are extracting from a 256-bit and+not (which will + // eventually get combined/lowered into ANDNP) with a concatenated operand, + // split the 'and' into 128-bit ops to avoid the concatenate and extract. + // We let generic combining take over from there to simplify the + // insert/extract and 'not'. + // This pattern emerges during AVX1 legalization. We handle it before lowering + // to avoid complications like splitting constant vector loads. + if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) && InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) { auto isConcatenatedNot = [](SDValue V) { V = peekThroughBitcasts(V); diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp index 2c1f9a5746e38..e9081a4ae4e72 100644 --- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp +++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Module.h" using namespace llvm; @@ -31,6 +32,15 @@ STATISTIC(MeetsUnwindV2Criteria, STATISTIC(FailsUnwindV2Criteria, "Number of functions that fail Unwind v2 criteria"); +static cl::opt MaximumUnwindCodes( + "x86-wineh-unwindv2-max-unwind-codes", cl::Hidden, + cl::desc("Maximum number of unwind codes permitted in each unwind info."), + cl::init(UINT8_MAX)); + +static cl::opt + ForceMode("x86-wineh-unwindv2-force-mode", cl::Hidden, + cl::desc("Overwrites the Unwind v2 mode for testing purposes.")); + namespace { class X86WinEHUnwindV2 : public MachineFunctionPass { @@ 
-44,10 +54,12 @@ class X86WinEHUnwindV2 : public MachineFunctionPass { StringRef getPassName() const override { return "WinEH Unwind V2"; } bool runOnMachineFunction(MachineFunction &MF) override; - bool rejectCurrentFunction() const { - FailsUnwindV2Criteria++; - return false; - } + +private: + /// Rejects the current function due to an internal error within LLVM. + static bool rejectCurrentFunctionInternalError(const MachineFunction &MF, + WinX64EHUnwindV2Mode Mode, + StringRef Reason); }; enum class FunctionState { @@ -69,8 +81,21 @@ FunctionPass *llvm::createX86WinEHUnwindV2Pass() { return new X86WinEHUnwindV2(); } +DebugLoc findDebugLoc(const MachineBasicBlock &MBB) { + for (const MachineInstr &MI : MBB) + if (MI.getDebugLoc()) + return MI.getDebugLoc(); + + return DebugLoc::getUnknown(); +} + bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { - if (!MF.getFunction().getParent()->getModuleFlag("winx64-eh-unwindv2")) + WinX64EHUnwindV2Mode Mode = + ForceMode.getNumOccurrences() + ? static_cast(ForceMode.getValue()) + : MF.getFunction().getParent()->getWinX64EHUnwindV2Mode(); + + if (Mode == WinX64EHUnwindV2Mode::Disabled) return false; // Current state of processing the function. We'll assume that all functions @@ -80,6 +105,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { // Prolog information. SmallVector PushedRegs; bool HasStackAlloc = false; + unsigned ApproximatePrologCodeCount = 0; // Requested changes. 
SmallVector UnwindV2StartLocations; @@ -99,6 +125,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { case X86::SEH_PushReg: if (State != FunctionState::InProlog) llvm_unreachable("SEH_PushReg outside of prolog"); + ApproximatePrologCodeCount++; PushedRegs.push_back(MI.getOperand(0).getImm()); break; @@ -106,9 +133,26 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { case X86::SEH_SetFrame: if (State != FunctionState::InProlog) llvm_unreachable("SEH_StackAlloc or SEH_SetFrame outside of prolog"); + // Assume a large alloc... + ApproximatePrologCodeCount += + (MI.getOpcode() == X86::SEH_StackAlloc) ? 3 : 1; HasStackAlloc = true; break; + case X86::SEH_SaveReg: + case X86::SEH_SaveXMM: + if (State != FunctionState::InProlog) + llvm_unreachable("SEH_SaveXMM or SEH_SaveReg outside of prolog"); + // Assume a big reg... + ApproximatePrologCodeCount += 3; + break; + + case X86::SEH_PushFrame: + if (State != FunctionState::InProlog) + llvm_unreachable("SEH_PushFrame outside of prolog"); + ApproximatePrologCodeCount++; + break; + case X86::SEH_EndPrologue: if (State != FunctionState::InProlog) llvm_unreachable("SEH_EndPrologue outside of prolog"); @@ -127,10 +171,16 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { case X86::SEH_EndEpilogue: if (State != FunctionState::InEpilog) llvm_unreachable("SEH_EndEpilogue outside of epilog"); - if ((HasStackAlloc != HasStackDealloc) || - (PoppedRegCount != PushedRegs.size())) - // Non-canonical epilog, reject the function. - return rejectCurrentFunction(); + if (HasStackAlloc != HasStackDealloc) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The prolog made a stack allocation, " + "but the epilog did not deallocate it"); + if (PoppedRegCount != PushedRegs.size()) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The prolog pushed more registers than " + "the epilog popped"); // If we didn't find the start location, then use the end of the // epilog. 
@@ -145,13 +195,26 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { if (State == FunctionState::InEpilog) { // If the prolog contains a stack allocation, then the first // instruction in the epilog must be to adjust the stack pointer. - if (!HasStackAlloc || HasStackDealloc || (PoppedRegCount > 0)) { - return rejectCurrentFunction(); - } + if (!HasStackAlloc) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is deallocating a stack " + "allocation, but the prolog did " + "not allocate one"); + if (HasStackDealloc) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is deallocating the stack " + "allocation more than once"); + if (PoppedRegCount > 0) + llvm_unreachable( + "Should have raised an error: either popping before " + "deallocating or deallocating without an allocation"); + HasStackDealloc = true; } else if (State == FunctionState::FinishedEpilog) - // Unexpected instruction after the epilog. - return rejectCurrentFunction(); + return rejectCurrentFunctionInternalError( + MF, Mode, "Unexpected mov or add instruction after the epilog"); break; case X86::POP64r: @@ -159,12 +222,22 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { // After the stack pointer has been adjusted, the epilog must // POP each register in reverse order of the PUSHes in the prolog. 
PoppedRegCount++; - if ((HasStackAlloc != HasStackDealloc) || - (PoppedRegCount > PushedRegs.size()) || - (PushedRegs[PushedRegs.size() - PoppedRegCount] != - MI.getOperand(0).getReg())) { - return rejectCurrentFunction(); - } + if (HasStackAlloc != HasStackDealloc) + return rejectCurrentFunctionInternalError( + MF, Mode, + "Cannot pop registers before the stack " + "allocation has been deallocated"); + if (PoppedRegCount > PushedRegs.size()) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is popping more registers than the prolog pushed"); + if (PushedRegs[PushedRegs.size() - PoppedRegCount] != + MI.getOperand(0).getReg()) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is popping a registers in " + "a different order than the " + "prolog pushed them"); // Unwind v2 records the size of the epilog not from where we place // SEH_BeginEpilogue (as that contains the instruction to adjust the @@ -176,7 +249,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { } } else if (State == FunctionState::FinishedEpilog) // Unexpected instruction after the epilog. - return rejectCurrentFunction(); + return rejectCurrentFunctionInternalError( + MF, Mode, "Registers are being popped after the epilog"); break; default: @@ -191,7 +265,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { if ((State == FunctionState::FinishedEpilog) || (State == FunctionState::InEpilog)) // Unknown instruction in or after the epilog. - return rejectCurrentFunction(); + return rejectCurrentFunctionInternalError( + MF, Mode, "Unexpected instruction in or after the epilog"); } } } @@ -203,6 +278,25 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { return false; } + MachineBasicBlock &FirstMBB = MF.front(); + // Assume +1 for the "header" UOP_Epilog that contains the epilog size, and + // that we won't be able to use the "last epilog at the end of function" + // optimization. 
+ if (ApproximatePrologCodeCount + UnwindV2StartLocations.size() + 1 > + static_cast(MaximumUnwindCodes)) { + if (Mode == WinX64EHUnwindV2Mode::Required) + MF.getFunction().getContext().diagnose(DiagnosticInfoGenericWithLoc( + "Windows x64 Unwind v2 is required, but the function '" + + MF.getName() + + "' has too many unwind codes. Try splitting the function or " + "reducing the number of places where it exits early with a tail " + "call.", + MF.getFunction(), findDebugLoc(FirstMBB))); + + FailsUnwindV2Criteria++; + return false; + } + MeetsUnwindV2Criteria++; // Emit the pseudo instruction that marks the start of each epilog. @@ -212,10 +306,20 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { TII->get(X86::SEH_UnwindV2Start)); } // Note that the function is using Unwind v2. - MachineBasicBlock &FirstMBB = MF.front(); - BuildMI(FirstMBB, FirstMBB.front(), FirstMBB.front().getDebugLoc(), + BuildMI(FirstMBB, FirstMBB.front(), findDebugLoc(FirstMBB), TII->get(X86::SEH_UnwindVersion)) .addImm(2); return true; } + +bool X86WinEHUnwindV2::rejectCurrentFunctionInternalError( + const MachineFunction &MF, WinX64EHUnwindV2Mode Mode, StringRef Reason) { + if (Mode == WinX64EHUnwindV2Mode::Required) + reportFatalInternalError("Windows x64 Unwind v2 is required, but LLVM has " + "generated incompatible code in function '" + + MF.getName() + "': " + Reason); + + FailsUnwindV2Criteria++; + return false; +} diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp index 57801752f170b..d36f18238f7a3 100644 --- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp +++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; @@ -750,7 +751,8 @@ static MCDisassembler *createXCoreDisassembler(const Target &T, 
return new XCoreDisassembler(STI, Ctx); } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreDisassembler() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeXCoreDisassembler() { // Register the disassembler. TargetRegistry::RegisterMCDisassembler(getTheXCoreTarget(), createXCoreDisassembler); diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp index 098d874f21490..0ef2da04171e2 100644 --- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp +++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" @@ -125,7 +126,8 @@ static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) { } // Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTargetMC() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeXCoreTargetMC() { // Register the MC asm info. 
RegisterMCAsmInfoFn X(getTheXCoreTarget(), createXCoreMCAsmInfo); diff --git a/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp index 8916c6ca7be74..556b31eab8b7e 100644 --- a/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp +++ b/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp @@ -8,6 +8,7 @@ #include "TargetInfo/XCoreTargetInfo.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" using namespace llvm; Target &llvm::getTheXCoreTarget() { @@ -15,7 +16,8 @@ Target &llvm::getTheXCoreTarget() { return TheXCoreTarget; } -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTargetInfo() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeXCoreTargetInfo() { RegisterTarget X(getTheXCoreTarget(), "xcore", "XCore", "XCore"); } diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp index b10b3056d82b2..0426088caf244 100644 --- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -33,6 +33,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -295,6 +296,7 @@ INITIALIZE_PASS(XCoreAsmPrinter, "xcore-asm-printer", "XCore Assembly Printer", false, false) // Force static initialization. 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreAsmPrinter() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void +LLVMInitializeXCoreAsmPrinter() { RegisterAsmPrinter X(getTheXCoreTarget()); } diff --git a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp index 3627b81a48055..88f46c38b2f9a 100644 --- a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/Compiler.h" #include using namespace llvm; @@ -102,7 +103,7 @@ void XCorePassConfig::addPreEmitPass() { } // Force static initialization. -extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTarget() { +extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTarget() { RegisterTargetMachine X(getTheXCoreTarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); initializeXCoreAsmPrinterPass(PR); diff --git a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp index 1f6cfec8edf4e..6c4e365451af0 100644 --- a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp +++ b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp @@ -62,11 +62,14 @@ class XtensaAsmParser : public MCTargetAsmParser { #include "XtensaGenAsmMatcher.inc" ParseStatus parseImmediate(OperandVector &Operands); - ParseStatus parseRegister(OperandVector &Operands, bool AllowParens = false, - bool SR = false); + ParseStatus + parseRegister(OperandVector &Operands, bool AllowParens = false, + bool SR = false, + Xtensa::RegisterAccessType RAType = Xtensa::REGISTER_EXCHANGE); ParseStatus parseOperandWithModifier(OperandVector &Operands); - bool parseOperand(OperandVector &Operands, StringRef Mnemonic, - bool SR = false); + bool + parseOperand(OperandVector &Operands, StringRef Mnemonic, bool SR = false, + 
Xtensa::RegisterAccessType RAType = Xtensa::REGISTER_EXCHANGE); bool ParseInstructionWithSR(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands); ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, @@ -580,7 +583,8 @@ bool XtensaAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc, } ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands, - bool AllowParens, bool SR) { + bool AllowParens, bool SR, + Xtensa::RegisterAccessType RAType) { SMLoc FirstS = getLoc(); bool HadParens = false; AsmToken Buf[2]; @@ -624,7 +628,7 @@ ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands, return ParseStatus::NoMatch; } - if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits())) + if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits(), RAType)) return ParseStatus::NoMatch; if (HadParens) @@ -685,7 +689,7 @@ ParseStatus XtensaAsmParser::parseOperandWithModifier(OperandVector &Operands) { /// from this information, adding to Operands. /// If operand was parsed, returns false, else true. bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, - bool SR) { + bool SR, Xtensa::RegisterAccessType RAType) { // Check if the current operand has a custom associated parser, if so, try to // custom parse the operand, or fallback to the general approach. 
ParseStatus Res = MatchOperandParserImpl(Operands, Mnemonic); @@ -699,7 +703,7 @@ bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, return true; // Attempt to parse token as register - if (parseRegister(Operands, true, SR).isSuccess()) + if (parseRegister(Operands, true, SR, RAType).isSuccess()) return false; // Attempt to parse token as an immediate @@ -713,6 +717,11 @@ bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { + Xtensa::RegisterAccessType RAType = + Name[0] == 'w' ? Xtensa::REGISTER_WRITE + : (Name[0] == 'r' ? Xtensa::REGISTER_READ + : Xtensa::REGISTER_EXCHANGE); + if ((Name.starts_with("wsr.") || Name.starts_with("rsr.") || Name.starts_with("xsr.")) && (Name.size() > 4)) { @@ -728,7 +737,7 @@ bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info, if (RegNo == 0) RegNo = MatchRegisterAltName(RegName); - if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits())) + if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits(), RAType)) return Error(NameLoc, "invalid register name"); // Parse operand @@ -753,7 +762,7 @@ bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info, } // Parse second operand - if (parseOperand(Operands, Name, true)) + if (parseOperand(Operands, Name, true, RAType)) return true; } diff --git a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp index dbd34964db074..3b37ac88b9b17 100644 --- a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp +++ b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp @@ -119,13 +119,39 @@ struct DecodeRegister { }; const DecodeRegister SRDecoderTable[] = { - {Xtensa::LBEG, 0}, {Xtensa::LEND, 1}, {Xtensa::LCOUNT, 2}, - {Xtensa::SAR, 3}, {Xtensa::BREG, 4}, {Xtensa::SAR, 3}, - {Xtensa::LITBASE, 5}, 
{Xtensa::ACCLO, 16}, {Xtensa::ACCHI, 17}, - {Xtensa::M0, 32}, {Xtensa::M1, 33}, {Xtensa::M2, 34}, - {Xtensa::M3, 35}, {Xtensa::WINDOWBASE, 72}, {Xtensa::WINDOWSTART, 73}, - {Xtensa::MEMCTL, 97}, {Xtensa::VECBASE, 231}, {Xtensa::MISC0, 244}, - {Xtensa::MISC1, 245}, {Xtensa::MISC2, 246}, {Xtensa::MISC3, 247}}; + {Xtensa::LBEG, 0}, {Xtensa::LEND, 1}, + {Xtensa::LCOUNT, 2}, {Xtensa::SAR, 3}, + {Xtensa::BREG, 4}, {Xtensa::LITBASE, 5}, + {Xtensa::ACCLO, 16}, {Xtensa::ACCHI, 17}, + {Xtensa::M0, 32}, {Xtensa::M1, 33}, + {Xtensa::M2, 34}, {Xtensa::M3, 35}, + {Xtensa::WINDOWBASE, 72}, {Xtensa::WINDOWSTART, 73}, + {Xtensa::IBREAKENABLE, 96}, {Xtensa::MEMCTL, 97}, + {Xtensa::DDR, 104}, {Xtensa::IBREAKA0, 128}, + {Xtensa::IBREAKA1, 129}, {Xtensa::DBREAKA0, 144}, + {Xtensa::DBREAKA1, 145}, {Xtensa::DBREAKC0, 160}, + {Xtensa::DBREAKC1, 161}, {Xtensa::CONFIGID0, 176}, + {Xtensa::EPC1, 177}, {Xtensa::EPC2, 178}, + {Xtensa::EPC3, 179}, {Xtensa::EPC4, 180}, + {Xtensa::EPC5, 181}, {Xtensa::EPC6, 182}, + {Xtensa::EPC7, 183}, {Xtensa::DEPC, 192}, + {Xtensa::EPS2, 194}, {Xtensa::EPS3, 195}, + {Xtensa::EPS4, 196}, {Xtensa::EPS5, 197}, + {Xtensa::EPS6, 198}, {Xtensa::EPS7, 199}, + {Xtensa::CONFIGID1, 208}, {Xtensa::EXCSAVE1, 209}, + {Xtensa::EXCSAVE2, 210}, {Xtensa::EXCSAVE3, 211}, + {Xtensa::EXCSAVE4, 212}, {Xtensa::EXCSAVE5, 213}, + {Xtensa::EXCSAVE6, 214}, {Xtensa::EXCSAVE7, 215}, + {Xtensa::CPENABLE, 224}, {Xtensa::INTERRUPT, 226}, + {Xtensa::INTCLEAR, 227}, {Xtensa::INTENABLE, 228}, + {Xtensa::PS, 230}, {Xtensa::VECBASE, 231}, + {Xtensa::EXCCAUSE, 232}, {Xtensa::DEBUGCAUSE, 233}, + {Xtensa::CCOUNT, 234}, {Xtensa::PRID, 235}, + {Xtensa::ICOUNT, 236}, {Xtensa::ICOUNTLEVEL, 237}, + {Xtensa::EXCVADDR, 238}, {Xtensa::CCOMPARE0, 240}, + {Xtensa::CCOMPARE1, 241}, {Xtensa::CCOMPARE2, 242}, + {Xtensa::MISC0, 244}, {Xtensa::MISC1, 245}, + {Xtensa::MISC2, 246}, {Xtensa::MISC3, 247}}; static DecodeStatus DecodeSRRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, @@ -133,12 +159,24 @@ 
static DecodeStatus DecodeSRRegisterClass(MCInst &Inst, uint64_t RegNo, if (RegNo > 255) return MCDisassembler::Fail; + Xtensa::RegisterAccessType RAType = + Inst.getOpcode() == Xtensa::WSR + ? Xtensa::REGISTER_WRITE + : (Inst.getOpcode() == Xtensa::RSR ? Xtensa::REGISTER_READ + : Xtensa::REGISTER_EXCHANGE); + for (unsigned i = 0; i < std::size(SRDecoderTable); i++) { if (SRDecoderTable[i].RegNo == RegNo) { MCPhysReg Reg = SRDecoderTable[i].Reg; - if (!Xtensa::checkRegister(Reg, - Decoder->getSubtargetInfo().getFeatureBits())) + // Handle special case. The INTERRUPT/INTSET registers use the same + // encoding, but INTERRUPT used for read and INTSET for write. + if (Reg == Xtensa::INTERRUPT && RAType == Xtensa::REGISTER_WRITE) { + Reg = Xtensa::INTSET; + } + + if (!Xtensa::checkRegister( + Reg, Decoder->getSubtargetInfo().getFeatureBits(), RAType)) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createReg(Reg)); diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp index 63fed46ac411f..f48c6225827b0 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp @@ -75,10 +75,95 @@ bool Xtensa::isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset) { } // Verify Special Register -bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits) { +bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits, + RegisterAccessType RAType) { switch (RegNo) { case Xtensa::BREG: return FeatureBits[Xtensa::FeatureBoolean]; + case Xtensa::CCOUNT: + case Xtensa::CCOMPARE0: + if (FeatureBits[Xtensa::FeatureTimers1]) + return true; + LLVM_FALLTHROUGH; + case Xtensa::CCOMPARE1: + if (FeatureBits[Xtensa::FeatureTimers2]) + return true; + LLVM_FALLTHROUGH; + case Xtensa::CCOMPARE2: + if (FeatureBits[Xtensa::FeatureTimers3]) + return true; + return false; + case Xtensa::CONFIGID0: + 
return RAType != Xtensa::REGISTER_EXCHANGE; + case Xtensa::CONFIGID1: + return RAType == Xtensa::REGISTER_READ; + case Xtensa::CPENABLE: + return FeatureBits[Xtensa::FeatureCoprocessor]; + case Xtensa::DEBUGCAUSE: + return RAType == Xtensa::REGISTER_READ && FeatureBits[Xtensa::FeatureDebug]; + case Xtensa::DEPC: + case Xtensa::EPC1: + case Xtensa::EXCCAUSE: + case Xtensa::EXCSAVE1: + case Xtensa::EXCVADDR: + return FeatureBits[Xtensa::FeatureException]; + LLVM_FALLTHROUGH; + case Xtensa::EPC2: + case Xtensa::EPS2: + case Xtensa::EXCSAVE2: + if (FeatureBits[Xtensa::FeatureHighPriInterrupts]) + return true; + LLVM_FALLTHROUGH; + case Xtensa::EPC3: + case Xtensa::EPS3: + case Xtensa::EXCSAVE3: + if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel3]) + return true; + LLVM_FALLTHROUGH; + case Xtensa::EPC4: + case Xtensa::EPS4: + case Xtensa::EXCSAVE4: + if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel4]) + return true; + LLVM_FALLTHROUGH; + case Xtensa::EPC5: + case Xtensa::EPS5: + case Xtensa::EXCSAVE5: + if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel5]) + return true; + LLVM_FALLTHROUGH; + case Xtensa::EPC6: + case Xtensa::EPS6: + case Xtensa::EXCSAVE6: + if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel6]) + return true; + LLVM_FALLTHROUGH; + case Xtensa::EPC7: + case Xtensa::EPS7: + case Xtensa::EXCSAVE7: + if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel7]) + return true; + return false; + case Xtensa::INTENABLE: + return FeatureBits[Xtensa::FeatureInterrupt]; + case Xtensa::INTERRUPT: + return RAType == Xtensa::REGISTER_READ && + FeatureBits[Xtensa::FeatureInterrupt]; + case Xtensa::INTSET: + case Xtensa::INTCLEAR: + return RAType == Xtensa::REGISTER_WRITE && + FeatureBits[Xtensa::FeatureInterrupt]; + case Xtensa::ICOUNT: + case Xtensa::ICOUNTLEVEL: + case Xtensa::IBREAKENABLE: + case Xtensa::DDR: + case Xtensa::IBREAKA0: + case Xtensa::IBREAKA1: + case Xtensa::DBREAKA0: + case Xtensa::DBREAKA1: + case Xtensa::DBREAKC0: + case 
Xtensa::DBREAKC1: + return FeatureBits[Xtensa::FeatureDebug]; case Xtensa::LBEG: case Xtensa::LEND: case Xtensa::LCOUNT: @@ -99,6 +184,8 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits) { case Xtensa::MISC2: case Xtensa::MISC3: return FeatureBits[Xtensa::FeatureMiscSR]; + case Xtensa::PRID: + return RAType == Xtensa::REGISTER_READ && FeatureBits[Xtensa::FeaturePRID]; case Xtensa::VECBASE: return FeatureBits[Xtensa::FeatureRelocatableVector]; case Xtensa::WINDOWBASE: diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h index cedc57a14f142..ec91f656bdcbd 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h @@ -55,8 +55,15 @@ bool isValidAddrOffset(int Scale, int64_t OffsetVal); // Check address offset for load/store instructions. bool isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset); +enum RegisterAccessType { + REGISTER_WRITE = 1, + REGISTER_READ = 2, + REGISTER_EXCHANGE = 3 +}; + // Verify if it's correct to use a special register. -bool checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits); +bool checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits, + RegisterAccessType RA); } // namespace Xtensa } // end namespace llvm diff --git a/llvm/lib/Target/Xtensa/XtensaFeatures.td b/llvm/lib/Target/Xtensa/XtensaFeatures.td index 55977277daf8e..1dd03283e9313 100644 --- a/llvm/lib/Target/Xtensa/XtensaFeatures.td +++ b/llvm/lib/Target/Xtensa/XtensaFeatures.td @@ -92,3 +92,43 @@ def FeatureDataCache : SubtargetFeature<"dcache", "HasDataCache", "true", "Enable Xtensa Data Cache option">; def HasDataCache : Predicate<"Subtarget->hasDataCache()">, AssemblerPredicate<(all_of FeatureDataCache)>; + +// Xtensa Interrupts Options. 
+def FeatureHighPriInterrupts : SubtargetFeature<"highpriinterrupts", + "HasHighPriInterrupts", "true", + "Enable Xtensa HighPriInterrupts option">; +def HasHighPriInterrupts : Predicate<"Subtarget->hasHighPriInterrupts()">, + AssemblerPredicate<(all_of FeatureHighPriInterrupts)>; + +foreach i = {3-7} in + def FeatureHighPriInterruptsLevel#i : SubtargetFeature<"highpriinterrupts-level"#i, + "HasHighPriInterruptsLevel"#i#"", "true", "Enable Xtensa HighPriInterrupts Level"#i, [FeatureHighPriInterrupts]>; + +def FeatureInterrupt : SubtargetFeature<"interrupt", "HasInterrupt", "true", + "Enable Xtensa Interrupt option">; +def HasInterrupt : Predicate<"Subtarget->hasInterrupt()">, + AssemblerPredicate<(all_of FeatureInterrupt)>; + +def FeatureException : SubtargetFeature<"exception", "HasException", "true", + "Enable Xtensa Exception option">; +def HasException : Predicate<"Subtarget->hasException()">, + AssemblerPredicate<(all_of FeatureException)>; + +def FeatureDebug : SubtargetFeature<"debug", "HasDebug", "true", + "Enable Xtensa Debug option">; +def HasDebug : Predicate<"Subtarget->hasDebug()">, + AssemblerPredicate<(all_of FeatureDebug)>; + +foreach i = {1-3} in + def FeatureTimers#i : SubtargetFeature<"timers"#i, + "HasTimers"#i#"", "true", "Enable Xtensa Timers "#i>; + +def FeaturePRID : SubtargetFeature<"prid", "HasPRID", "true", + "Enable Xtensa Processor ID option">; +def HasPRID : Predicate<"Subtarget->hasPRID()">, + AssemblerPredicate<(all_of FeaturePRID)>; + +def FeatureCoprocessor : SubtargetFeature<"coprocessor", "HasCoprocessor", "true", + "Enable Xtensa Coprocessor option">; +def HasCoprocessor : Predicate<"Subtarget->hasCoprocessor()">, + AssemblerPredicate<(all_of FeatureCoprocessor)>; diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td index 9a9424f916996..7e9fcd7058c20 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td @@ -499,6 +499,18 @@ def EXTW : 
RRR_Inst<0x00, 0x00, 0x00, (outs), (ins), let hasSideEffects = 1; } +//===----------------------------------------------------------------------===// +// Illegal instructions +//===----------------------------------------------------------------------===// + +def ILL : CALLX_Inst<0x00, 0x00, 0x00, (outs), (ins), + "ill", []> { + let m = 0x0; + let n = 0x0; + let r = 0; + let s = 0; +} + //===----------------------------------------------------------------------===// // Processor control instructions //===----------------------------------------------------------------------===// @@ -1044,6 +1056,109 @@ let Predicates = [HasRegionProtection] in { } } +//===----------------------------------------------------------------------===// +// Debug instructions +//===----------------------------------------------------------------------===// + +let isBarrier = 1, isTerminator = 1 in { + def BREAK : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$s, uimm4:$t), + "break\t$s, $t", []>, Requires<[HasDebug]> { + let r = 0x04; + } + + def BREAK_N : RRRN_Inst<0x0C, (outs), (ins uimm4:$imm), + "break.n\t$imm", []>, Requires<[HasDensity, HasDebug]> { + bits<4> imm; + + let r = 0xf; + let s = imm; + let t = 0x2; + } +} + +def : InstAlias<"_break.n\t$imm", (BREAK_N uimm4:$imm)>; + +def : Pat<(trap), (BREAK (i32 1), (i32 15))>; + +// Load instruction +def LDDR32P : RRR_Inst<0x00, 0x00, 0x00, (outs AR:$s), (ins), + "lddr32.p\t$s", []>, Requires<[HasDebug]> { + let r = 0x7; + let t = 0xe; + let mayLoad = 1; +} + +// Store instruction +def SDDR32P : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins AR:$s), + "sddr32.p\t$s", []>, Requires<[HasDebug]> { + let r = 0x7; + let t = 0xf; + let mayStore = 1; +} + +//===----------------------------------------------------------------------===// +// Exception feature instructions +//===----------------------------------------------------------------------===// + +def EXCW : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins), + "excw", []>, Requires<[HasException]> { + 
let r = 0x2; + let s = 0x0; + let t = 0x8; +} + +def RFDE : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins), + "rfde", []>, Requires<[HasException]> { + let r = 0x3; + let s = 0x2; + let t = 0x0; +} + + +def RFE : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins), + "rfe", []>, Requires<[HasException]> { + let r = 0x3; + let s = 0x0; + let t = 0x0; +} + +def SYSCALL : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins), + "syscall", []>, Requires<[HasException]> { + let r = 0x5; + let s = 0x0; + let t = 0x0; +} + +//===----------------------------------------------------------------------===// +// Interrupt feature instructions +//===----------------------------------------------------------------------===// + +def RSIL : RRR_Inst<0x00, 0x00, 0x00, (outs AR:$t), (ins uimm4:$imm), + "rsil\t$t, $imm", []>, Requires<[HasInterrupt]> { + bits<4> imm; + + let r = 0x6; + let s = imm{3-0}; +} + +def WAITI : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$imm), + "waiti\t$imm", []>, Requires<[HasInterrupt]> { + bits<4> imm; + + let r = 0x7; + let s = imm{3-0}; + let t = 0; +} + +def RFI : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$imm), + "rfi\t$imm", []>, Requires<[HasHighPriInterrupts]> { + bits<4> imm; + + let r = 0x3; + let s = imm{3-0}; + let t = 0x1; +} + //===----------------------------------------------------------------------===// // DSP Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td index c54e2556ba11f..7d44029124344 100644 --- a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td @@ -91,9 +91,111 @@ def LITBASE : SRReg<5, "litbase", ["LITBASE", "5"]>; def WINDOWBASE : SRReg<72, "windowbase", ["WINDOWBASE", "72"]>; def WINDOWSTART : SRReg<73, "windowstart", ["WINDOWSTART", "73"]>; +// Instuction breakpoint enable register +def IBREAKENABLE : SRReg<96, "ibreakenable", ["IBREAKENABLE", "96"]>; + 
// Memory Control Register def MEMCTL : SRReg<97, "memctl", ["MEMCTL", "97"]>; +def DDR : SRReg<104, "ddr", ["DDR", "104"]>; + +// Instuction break address register 0 +def IBREAKA0 : SRReg<128, "ibreaka0", ["IBREAKA0", "128"]>; + +// Instuction break address register 1 +def IBREAKA1 : SRReg<129, "ibreaka1", ["IBREAKA1", "129"]>; + +// Data break address register 0 +def DBREAKA0 : SRReg<144, "dbreaka0", ["DBREAKA0", "144"]>; + +// Data break address register 1 +def DBREAKA1 : SRReg<145, "dbreaka1", ["DBREAKA1", "145"]>; + +// Data breakpoint control register 0 +def DBREAKC0 : SRReg<160, "dbreakc0", ["DBREAKC0", "160"]>; + +// Data breakpoint control register 1 +def DBREAKC1 : SRReg<161, "dbreakc1", ["DBREAKC1", "161"]>; + +def CONFIGID0 : SRReg<176, "configid0", ["CONFIGID0", "176"]>; + +// Exception PC1 +def EPC1 : SRReg<177, "epc1", ["EPC1", "177"]>; + +// Exception PC2 +def EPC2 : SRReg<178, "epc2", ["EPC2", "178"]>; + +// Exception PC3 +def EPC3 : SRReg<179, "epc3", ["EPC3", "179"]>; + +// Exception PC4 +def EPC4 : SRReg<180, "epc4", ["EPC4", "180"]>; + +// Exception PC5 +def EPC5 : SRReg<181, "epc5", ["EPC5", "181"]>; + +// Exception PC6 +def EPC6 : SRReg<182, "epc6", ["EPC6", "182"]>; + +// Exception PC7 +def EPC7 : SRReg<183, "epc7", ["EPC7", "183"]>; + +def DEPC : SRReg<192, "depc", ["DEPC", "192"]>; +def EPS2 : SRReg<194, "eps2", ["EPS2", "194"]>; +def EPS3 : SRReg<195, "eps3", ["EPS3", "195"]>; +def EPS4 : SRReg<196, "eps4", ["EPS4", "196"]>; +def EPS5 : SRReg<197, "eps5", ["EPS5", "197"]>; +def EPS6 : SRReg<198, "eps6", ["EPS6", "198"]>; +def EPS7 : SRReg<199, "eps7", ["EPS7", "199"]>; + +def CONFIGID1 : SRReg<208, "configid1", ["CONFIGID1", "208"]>; + +def EXCSAVE1 : SRReg<209, "excsave1", ["EXCSAVE1", "209"]>; +def EXCSAVE2 : SRReg<210, "excsave2", ["EXCSAVE2", "210"]>; +def EXCSAVE3 : SRReg<211, "excsave3", ["EXCSAVE3", "211"]>; +def EXCSAVE4 : SRReg<212, "excsave4", ["EXCSAVE4", "212"]>; +def EXCSAVE5 : SRReg<213, "excsave5", ["EXCSAVE5", "213"]>; 
+def EXCSAVE6 : SRReg<214, "excsave6", ["EXCSAVE6", "214"]>; +def EXCSAVE7 : SRReg<215, "excsave7", ["EXCSAVE7", "215"]>; + +def CPENABLE : SRReg<224, "cpenable", ["CPENABLE", "224"]>; + +// Interrupt enable mask register +def INTERRUPT : SRReg<226, "interrupt", ["INTERRUPT", "226"]>; + +def INTSET : SRReg<226, "intset", ["INTSET"]>; + +def INTCLEAR : SRReg<227, "intclear", ["INTCLEAR", "227"]>; + +def INTENABLE : SRReg<228, "intenable", ["INTENABLE", "228"]>; + +// Processor State +def PS : SRReg<230, "ps", ["PS", "230"]>; + +def EXCCAUSE : SRReg<232, "exccause", ["EXCCAUSE", "232"]>; + +// Cause of last debug exception register +def DEBUGCAUSE : SRReg<233, "debugcause", ["DEBUGCAUSE", "233"]>; + +// Processor Clock Count Register +def CCOUNT : SRReg<234, "ccount", ["CCOUNT", "234"]>; + +// Processor ID Register +def PRID : SRReg<235, "prid", ["PRID", "235"]>; + +def ICOUNT : SRReg<236, "icount", ["ICOUNT", "236"]>; +def ICOUNTLEVEL : SRReg<237, "icountlevel", ["ICOUNTLEVEL", "237"]>; +def EXCVADDR : SRReg<238, "excvaddr", ["EXCVADDR", "238"]>; + +// Cycle number to interrupt register 0 +def CCOMPARE0 : SRReg<240, "ccompare0", ["CCOMPARE0", "240"]>; + +// Cycle number to interrupt register 1 +def CCOMPARE1 : SRReg<241, "ccompare1", ["CCOMPARE1", "241"]>; + +// Cycle number to interrupt register 2 +def CCOMPARE2 : SRReg<242, "ccompare2", ["CCOMPARE2", "242"]>; + // Vector base register def VECBASE : SRReg<231, "vecbase", ["VECBASE", "231"]>; @@ -116,8 +218,13 @@ def MR23 : RegisterClass<"Xtensa", [i32], 32, (add M2, M3)>; def MR : RegisterClass<"Xtensa", [i32], 32, (add MR01, MR23)>; def SR : RegisterClass<"Xtensa", [i32], 32, (add - LBEG, LEND, LCOUNT, SAR, BREG, LITBASE, ACCLO, ACCHI, MR, WINDOWBASE, WINDOWSTART, - MEMCTL, VECBASE, MISC0, MISC1, MISC2, MISC3)>; + LBEG, LEND, LCOUNT, SAR, BREG, LITBASE, ACCLO, ACCHI, MR, + WINDOWBASE, WINDOWSTART, IBREAKENABLE, MEMCTL, DDR, IBREAKA0, IBREAKA1, + DBREAKA0, DBREAKA1, DBREAKC0, DBREAKC1, CONFIGID0, EPC1, EPC2, EPC3, 
EPC4, EPC5, + EPC6, EPC7, DEPC, EPS2, EPS3, EPS4, EPS5, EPS6, EPS7, CONFIGID1, EXCSAVE1, EXCSAVE2, + EXCSAVE3, EXCSAVE4, EXCSAVE5, EXCSAVE6, EXCSAVE7, CPENABLE, INTERRUPT, INTSET, INTCLEAR, INTENABLE, + PS, VECBASE, EXCCAUSE, DEBUGCAUSE, CCOUNT, PRID, ICOUNT, ICOUNTLEVEL, EXCVADDR, CCOMPARE0, + CCOMPARE1, CCOMPARE2, MISC0, MISC1, MISC2, MISC3)>; //===----------------------------------------------------------------------===// // Boolean registers diff --git a/llvm/lib/Target/Xtensa/XtensaSubtarget.h b/llvm/lib/Target/Xtensa/XtensaSubtarget.h index 9909fb9ff4b37..da4e14a53eef3 100644 --- a/llvm/lib/Target/Xtensa/XtensaSubtarget.h +++ b/llvm/lib/Target/Xtensa/XtensaSubtarget.h @@ -82,6 +82,15 @@ class XtensaSubtarget : public XtensaGenSubtargetInfo { bool hasMiscSR() const { return HasMiscSR; } bool hasExtendedL32R() const { return HasExtendedL32R; } bool hasDataCache() const { return HasDataCache; } + bool hasHighPriInterrupts() const { return HasHighPriInterrupts; } + bool hasHighPriInterruptsLevel3() const { return HasHighPriInterruptsLevel3; } + bool hasHighPriInterruptsLevel4() const { return HasHighPriInterruptsLevel4; } + bool hasHighPriInterruptsLevel5() const { return HasHighPriInterruptsLevel5; } + bool hasHighPriInterruptsLevel6() const { return HasHighPriInterruptsLevel6; } + bool hasHighPriInterruptsLevel7() const { return HasHighPriInterruptsLevel7; } + bool hasInterrupt() const { return HasInterrupt; } + bool hasException() const { return HasException; } + bool isWindowedABI() const { return hasWindowed(); } // Automatically generated by tblgen. 
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index e13c6e6d28c2b..4a2523440f0f0 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -60,7 +60,7 @@ uint64_t AArch64::getFMVPriority(ArrayRef Features) { ExtensionSet FeatureBits; for (const StringRef Feature : Features) { std::optional FMV = parseFMVExtension(Feature); - if (!FMV) { + if (!FMV && Feature.starts_with('+')) { if (std::optional Info = targetFeatureToExtension(Feature)) FMV = lookupFMVByID(Info->ID); } @@ -181,7 +181,8 @@ std::optional AArch64::parseFMVExtension(StringRef FMVExt) { std::optional AArch64::targetFeatureToExtension(StringRef TargetFeature) { for (const auto &E : Extensions) - if (TargetFeature == E.PosTargetFeature) + if (TargetFeature == E.PosTargetFeature || + TargetFeature == E.NegTargetFeature) return E; return {}; } diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index f62361d334704..8c156c93ba8d1 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -719,9 +719,7 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, if (Inst.mayWriteToMemory() && isModSet(AA.getModRefInfo(&Inst, Loc))) return false; - // Ignore debug info so that's not counted against MaxInstrsToScan. - // Otherwise debug info could affect codegen. 
- if (!isa(Inst) && ++NumScanned > MaxInstrsToScan) + if (++NumScanned > MaxInstrsToScan) return false; } diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 050eed376ed3f..a2548258ddaf0 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -242,8 +242,6 @@ Constant * AA::getInitialValueForObj(Attributor &A, const AbstractAttribute &QueryingAA, Value &Obj, Type &Ty, const TargetLibraryInfo *TLI, const DataLayout &DL, AA::RangeTy *RangePtr) { - if (isa(Obj)) - return UndefValue::get(&Ty); if (Constant *Init = getInitialValueOfAllocation(&Obj, TLI, &Ty)) return Init; auto *GV = dyn_cast(&Obj); @@ -3614,6 +3612,8 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { if (SimplifyAllLoads) getAssumedSimplified(IRPosition::value(I), nullptr, UsedAssumedInformation, AA::Intraprocedural); + getOrCreateAAFor( + IRPosition::value(*LI->getPointerOperand())); getOrCreateAAFor( IRPosition::value(*LI->getPointerOperand())); } else { diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 3799a696f67af..5cb8f888354bd 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -191,6 +191,7 @@ PIPE_OPERATOR(AAInterFnReachability) PIPE_OPERATOR(AAPointerInfo) PIPE_OPERATOR(AAAssumptionInfo) PIPE_OPERATOR(AAUnderlyingObjects) +PIPE_OPERATOR(AAInvariantLoadPointer) PIPE_OPERATOR(AAAddressSpace) PIPE_OPERATOR(AAAllocationInfo) PIPE_OPERATOR(AAIndirectCallInfo) @@ -12533,6 +12534,342 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo { }; } // namespace +/// --------------------- Invariant Load Pointer ------------------------------- +namespace { + +struct AAInvariantLoadPointerImpl + : public StateWrapper, + AAInvariantLoadPointer> { + + enum { + // pointer does not alias within the bounds of the function + IS_NOALIAS = 1 << 0, + // pointer is 
not involved in any effectful instructions within the bounds + // of the function + IS_NOEFFECT = 1 << 1, + // loads are invariant within the bounds of the function + IS_LOCALLY_INVARIANT = 1 << 2, + // memory lifetime is constrained within the bounds of the function + IS_LOCALLY_CONSTRAINED = 1 << 3, + + IS_BEST_STATE = IS_NOALIAS | IS_NOEFFECT | IS_LOCALLY_INVARIANT | + IS_LOCALLY_CONSTRAINED, + }; + static_assert(getBestState() == IS_BEST_STATE, "Unexpected best state"); + + using Base = + StateWrapper, AAInvariantLoadPointer>; + + // the BitIntegerState is optimistic about IS_NOALIAS and IS_NOEFFECT, but + // pessimistic about IS_KNOWN_INVARIANT + AAInvariantLoadPointerImpl(const IRPosition &IRP, Attributor &A) + : Base(IRP) {} + + bool isKnownInvariant() const final { + return isKnownLocallyInvariant() && isKnown(IS_LOCALLY_CONSTRAINED); + } + + bool isKnownLocallyInvariant() const final { + if (isKnown(IS_LOCALLY_INVARIANT)) + return true; + return isKnown(IS_NOALIAS | IS_NOEFFECT); + } + + bool isAssumedInvariant() const final { + return isAssumedLocallyInvariant() && isAssumed(IS_LOCALLY_CONSTRAINED); + } + + bool isAssumedLocallyInvariant() const final { + if (isAssumed(IS_LOCALLY_INVARIANT)) + return true; + return isAssumed(IS_NOALIAS | IS_NOEFFECT); + } + + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + Changed |= updateNoAlias(A); + if (requiresNoAlias() && !isAssumed(IS_NOALIAS)) + return indicatePessimisticFixpoint(); + + Changed |= updateNoEffect(A); + + Changed |= updateLocalInvariance(A); + + return Changed; + } + + ChangeStatus manifest(Attributor &A) override { + if (!isKnownInvariant()) + return ChangeStatus::UNCHANGED; + + ChangeStatus Changed = ChangeStatus::UNCHANGED; + const Value *Ptr = &getAssociatedValue(); + const auto TagInvariantLoads = [&](const Use &U, bool &) { + if (U.get() != Ptr) + return true; + auto *I = dyn_cast(U.getUser()); + if (!I) + return true; + + // Ensure that 
we are only changing uses from the corresponding callgraph + // SSC in the case that the AA isn't run on the entire module + if (!A.isRunOn(I->getFunction())) + return true; + + if (I->hasMetadata(LLVMContext::MD_invariant_load)) + return true; + + if (auto *LI = dyn_cast(I)) { + LI->setMetadata(LLVMContext::MD_invariant_load, + MDNode::get(LI->getContext(), {})); + Changed = ChangeStatus::CHANGED; + } + return true; + }; + + (void)A.checkForAllUses(TagInvariantLoads, *this, *Ptr); + return Changed; + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr(Attributor *) const override { + if (isKnownInvariant()) + return "load-invariant pointer"; + return "non-invariant pointer"; + } + + /// See AbstractAttribute::trackStatistics(). + void trackStatistics() const override {} + +private: + /// Indicate that noalias is required for the pointer to be invariant. + bool requiresNoAlias() const { + switch (getPositionKind()) { + default: + // Conservatively default to require noalias. + return true; + case IRP_FLOAT: + case IRP_RETURNED: + case IRP_CALL_SITE: + return false; + case IRP_CALL_SITE_RETURNED: { + const auto &CB = cast(getAnchorValue()); + return !isIntrinsicReturningPointerAliasingArgumentWithoutCapturing( + &CB, /*MustPreserveNullness=*/false); + } + case IRP_ARGUMENT: { + const Function *F = getAssociatedFunction(); + assert(F && "no associated function for argument"); + return !isCallableCC(F->getCallingConv()); + } + } + } + + bool isExternal() const { + const Function *F = getAssociatedFunction(); + if (!F) + return true; + return isCallableCC(F->getCallingConv()) && + getPositionKind() != IRP_CALL_SITE_RETURNED; + } + + ChangeStatus updateNoAlias(Attributor &A) { + if (isKnown(IS_NOALIAS) || !isAssumed(IS_NOALIAS)) + return ChangeStatus::UNCHANGED; + + // Try to use AANoAlias. 
+ if (const auto *ANoAlias = A.getOrCreateAAFor( + getIRPosition(), this, DepClassTy::REQUIRED)) { + if (ANoAlias->isKnownNoAlias()) { + addKnownBits(IS_NOALIAS); + return ChangeStatus::CHANGED; + } + + if (!ANoAlias->isAssumedNoAlias()) { + removeAssumedBits(IS_NOALIAS); + return ChangeStatus::CHANGED; + } + + return ChangeStatus::UNCHANGED; + } + + // Try to infer noalias from argument attribute, since it is applicable for + // the duration of the function. + if (const Argument *Arg = getAssociatedArgument()) { + if (Arg->hasNoAliasAttr()) { + addKnownBits(IS_NOALIAS); + return ChangeStatus::UNCHANGED; + } + + // Noalias information is not provided, and cannot be inferred, + // so we conservatively assume the pointer aliases. + removeAssumedBits(IS_NOALIAS); + return ChangeStatus::CHANGED; + } + + return ChangeStatus::UNCHANGED; + } + + ChangeStatus updateNoEffect(Attributor &A) { + if (isKnown(IS_NOEFFECT) || !isAssumed(IS_NOEFFECT)) + return ChangeStatus::UNCHANGED; + + if (!getAssociatedFunction()) + return indicatePessimisticFixpoint(); + + const auto HasNoEffectLoads = [&](const Use &U, bool &) { + const auto *LI = dyn_cast(U.getUser()); + return !LI || !LI->mayHaveSideEffects(); + }; + if (!A.checkForAllUses(HasNoEffectLoads, *this, getAssociatedValue())) + return indicatePessimisticFixpoint(); + + // Try to use AAMemoryBehavior to infer readonly attribute. + if (const auto *AMemoryBehavior = A.getOrCreateAAFor( + getIRPosition(), this, DepClassTy::REQUIRED)) { + if (!AMemoryBehavior->isAssumedReadOnly()) + return indicatePessimisticFixpoint(); + + if (AMemoryBehavior->isKnownReadOnly()) { + addKnownBits(IS_NOEFFECT); + return ChangeStatus::UNCHANGED; + } + + return ChangeStatus::UNCHANGED; + } + + if (const Argument *Arg = getAssociatedArgument()) { + if (Arg->onlyReadsMemory()) { + addKnownBits(IS_NOEFFECT); + return ChangeStatus::UNCHANGED; + } + + // Readonly information is not provided, and cannot be inferred from + // AAMemoryBehavior. 
+ return indicatePessimisticFixpoint(); + } + + return ChangeStatus::UNCHANGED; + } + + ChangeStatus updateLocalInvariance(Attributor &A) { + if (isKnown(IS_LOCALLY_INVARIANT) || !isAssumed(IS_LOCALLY_INVARIANT)) + return ChangeStatus::UNCHANGED; + + // try to infer invariance from underlying objects + const auto *AUO = A.getOrCreateAAFor( + getIRPosition(), this, DepClassTy::REQUIRED); + if (!AUO) + return ChangeStatus::UNCHANGED; + + bool UsedAssumedInformation = false; + const auto IsLocallyInvariantLoadIfPointer = [&](const Value &V) { + if (!V.getType()->isPointerTy()) + return true; + const auto *IsInvariantLoadPointer = + A.getOrCreateAAFor(IRPosition::value(V), this, + DepClassTy::REQUIRED); + // Conservatively fail if invariance cannot be inferred. + if (!IsInvariantLoadPointer) + return false; + + if (IsInvariantLoadPointer->isKnownLocallyInvariant()) + return true; + if (!IsInvariantLoadPointer->isAssumedLocallyInvariant()) + return false; + + UsedAssumedInformation = true; + return true; + }; + if (!AUO->forallUnderlyingObjects(IsLocallyInvariantLoadIfPointer)) + return indicatePessimisticFixpoint(); + + if (const auto *CB = dyn_cast(&getAnchorValue())) { + if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing( + CB, /*MustPreserveNullness=*/false)) { + for (const Value *Arg : CB->args()) { + if (!IsLocallyInvariantLoadIfPointer(*Arg)) + return indicatePessimisticFixpoint(); + } + } + } + + if (!UsedAssumedInformation) { + // Pointer is known and not just assumed to be locally invariant. 
+ addKnownBits(IS_LOCALLY_INVARIANT); + return ChangeStatus::CHANGED; + } + + return ChangeStatus::UNCHANGED; + } +}; + +struct AAInvariantLoadPointerFloating final : AAInvariantLoadPointerImpl { + AAInvariantLoadPointerFloating(const IRPosition &IRP, Attributor &A) + : AAInvariantLoadPointerImpl(IRP, A) {} +}; + +struct AAInvariantLoadPointerReturned final : AAInvariantLoadPointerImpl { + AAInvariantLoadPointerReturned(const IRPosition &IRP, Attributor &A) + : AAInvariantLoadPointerImpl(IRP, A) {} + + void initialize(Attributor &) override { + removeAssumedBits(IS_LOCALLY_CONSTRAINED); + } +}; + +struct AAInvariantLoadPointerCallSiteReturned final + : AAInvariantLoadPointerImpl { + AAInvariantLoadPointerCallSiteReturned(const IRPosition &IRP, Attributor &A) + : AAInvariantLoadPointerImpl(IRP, A) {} + + void initialize(Attributor &A) override { + const Function *F = getAssociatedFunction(); + assert(F && "no associated function for return from call"); + + if (!F->isDeclaration() && !F->isIntrinsic()) + return AAInvariantLoadPointerImpl::initialize(A); + + const auto &CB = cast(getAnchorValue()); + if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing( + &CB, /*MustPreserveNullness=*/false)) + return AAInvariantLoadPointerImpl::initialize(A); + + if (F->onlyReadsMemory() && F->hasNoSync()) + return AAInvariantLoadPointerImpl::initialize(A); + + // At this point, the function is opaque, so we conservatively assume + // non-invariance. 
+ indicatePessimisticFixpoint(); + } +}; + +struct AAInvariantLoadPointerArgument final : AAInvariantLoadPointerImpl { + AAInvariantLoadPointerArgument(const IRPosition &IRP, Attributor &A) + : AAInvariantLoadPointerImpl(IRP, A) {} + + void initialize(Attributor &) override { + const Function *F = getAssociatedFunction(); + assert(F && "no associated function for argument"); + + if (!isCallableCC(F->getCallingConv())) { + addKnownBits(IS_LOCALLY_CONSTRAINED); + return; + } + + if (!F->hasLocalLinkage()) + removeAssumedBits(IS_LOCALLY_CONSTRAINED); + } +}; + +struct AAInvariantLoadPointerCallSiteArgument final + : AAInvariantLoadPointerImpl { + AAInvariantLoadPointerCallSiteArgument(const IRPosition &IRP, Attributor &A) + : AAInvariantLoadPointerImpl(IRP, A) {} +}; +} // namespace + /// ------------------------ Address Space ------------------------------------ namespace { @@ -13038,6 +13375,7 @@ const char AAInterFnReachability::ID = 0; const char AAPointerInfo::ID = 0; const char AAAssumptionInfo::ID = 0; const char AAUnderlyingObjects::ID = 0; +const char AAInvariantLoadPointer::ID = 0; const char AAAddressSpace::ID = 0; const char AAAllocationInfo::ID = 0; const char AAIndirectCallInfo::ID = 0; @@ -13172,6 +13510,7 @@ CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialValues) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUndef) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFPClass) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPointerInfo) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAInvariantLoadPointer) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAddressSpace) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAllocationInfo) diff --git a/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp index 718452fc02764..bc98f994f490c 100644 --- a/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp +++ b/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -35,8 +35,15 @@ static cl::opt ConvertToLocal( cl::desc("Convert 
available_externally into locals, renaming them " "to avoid link-time clashes.")); +static cl::opt ConvertGlobalVariableInAddrSpace( + "avail-extern-gv-in-addrspace-to-local", cl::Hidden, + cl::desc( + "Convert available_externally global variables into locals if they are " + "in specificed addrspace, renaming them to avoid link-time clashes.")); + STATISTIC(NumRemovals, "Number of functions removed"); -STATISTIC(NumConversions, "Number of functions converted"); +STATISTIC(NumFunctionsConverted, "Number of functions converted"); +STATISTIC(NumGlobalVariablesConverted, "Number of global variables converted"); STATISTIC(NumVariables, "Number of global variables removed"); void deleteFunction(Function &F) { @@ -45,6 +52,10 @@ void deleteFunction(Function &F) { ++NumRemovals; } +static std::string getNewName(Module &M, const GlobalValue &GV) { + return GV.getName().str() + ".__uniq" + getUniqueModuleId(&M); +} + /// Create a copy of the thinlto import, mark it local, and redirect direct /// calls to the copy. Only direct calls are replaced, so that e.g. indirect /// call function pointer tests would use the global identity of the function. @@ -68,7 +79,7 @@ static void convertToLocalCopy(Module &M, Function &F) { // functions with the same name, but that just creates more trouble than // necessary e.g. distinguishing profiles or debugging. Instead, we append the // module identifier. - auto NewName = OrigName + ".__uniq" + getUniqueModuleId(&M); + std::string NewName = getNewName(M, F); F.setName(NewName); if (auto *SP = F.getSubprogram()) SP->replaceLinkageName(MDString::get(F.getParent()->getContext(), NewName)); @@ -85,16 +96,33 @@ static void convertToLocalCopy(Module &M, Function &F) { F.getAddressSpace(), OrigName, F.getParent()); F.replaceUsesWithIf(Decl, [&](Use &U) { return !isa(U.getUser()); }); - ++NumConversions; + ++NumFunctionsConverted; +} + +/// Similar to the function above, this is to convert an externally available +/// global variable to local. 
+static void convertToLocalCopy(Module &M, GlobalVariable &GV) { + assert(GV.hasAvailableExternallyLinkage()); + GV.setName(getNewName(M, GV)); + GV.setLinkage(GlobalValue::InternalLinkage); + ++NumGlobalVariablesConverted; } static bool eliminateAvailableExternally(Module &M, bool Convert) { bool Changed = false; - // Drop initializers of available externally global variables. + // If a global variable is available externally and in the specified address + // space, convert it to local linkage; otherwise, drop its initializer. for (GlobalVariable &GV : M.globals()) { if (!GV.hasAvailableExternallyLinkage()) continue; + if (ConvertGlobalVariableInAddrSpace.getNumOccurrences() && + GV.getAddressSpace() == ConvertGlobalVariableInAddrSpace && + !GV.use_empty()) { + convertToLocalCopy(M, GV); + Changed = true; + continue; + } if (GV.hasInitializer()) { Constant *Init = GV.getInitializer(); GV.setInitializer(nullptr); diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 1034ce9582152..45fa9d57e4862 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -71,7 +71,7 @@ static cl::opt MinCodeSizeSavings( "much percent of the original function size")); static cl::opt MinLatencySavings( - "funcspec-min-latency-savings", cl::init(40), cl::Hidden, + "funcspec-min-latency-savings", cl::init(20), cl::Hidden, cl::desc("Reject specializations whose latency savings are less than this " "much percent of the original function size")); diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index cb18b55ae2183..8d6ff72fa6061 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -717,8 +717,6 @@ static void moveFunctionData(Function &Old, Function &New, if (ReturnInst *RI = dyn_cast(I)) NewEnds.insert(std::make_pair(RI->getReturnValue(), &CurrBB)); - std::vector 
DebugInsts; - for (Instruction &Val : CurrBB) { // Since debug-info originates from many different locations in the // program, it will cause incorrect reporting from a debugger if we keep @@ -746,24 +744,12 @@ static void moveFunctionData(Function &Old, Function &New, continue; } - // From this point we are only handling call instructions. - CallInst *CI = cast(&Val); - - // Collect debug intrinsics for later removal. - if (isa(CI)) { - DebugInsts.push_back(&Val); - continue; - } - // Edit the scope of called functions inside of outlined functions. if (DISubprogram *SP = New.getSubprogram()) { DILocation *DI = DILocation::get(New.getContext(), 0, 0, SP); Val.setDebugLoc(DI); } } - - for (Instruction *I : DebugInsts) - I->eraseFromParent(); } } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index ab67a0980e0c2..86e1ebf937dbe 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -561,6 +561,8 @@ class LowerTypeTestsModule { return FunctionAnnotations.contains(V); } + void maybeReplaceComdat(Function *F, StringRef OriginalName); + public: LowerTypeTestsModule(Module &M, ModuleAnalysisManager &AM, ModuleSummaryIndex *ExportSummary, @@ -1082,6 +1084,23 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) { } } +void LowerTypeTestsModule::maybeReplaceComdat(Function *F, + StringRef OriginalName) { + // For COFF we should also rename the comdat if this function also + // happens to be the key function. Even if the comdat name changes, this + // should still be fine since comdat and symbol resolution happens + // before LTO, so all symbols which would prevail have been selected. 
+ if (F->hasComdat() && ObjectFormat == Triple::COFF && + F->getComdat()->getName() == OriginalName) { + Comdat *OldComdat = F->getComdat(); + Comdat *NewComdat = M.getOrInsertComdat(F->getName()); + for (GlobalObject &GO : M.global_objects()) { + if (GO.getComdat() == OldComdat) + GO.setComdat(NewComdat); + } + } +} + // ThinLTO backend: the function F has a jump table entry; update this module // accordingly. isJumpTableCanonical describes the type of the jump table entry. void LowerTypeTestsModule::importFunction( @@ -1115,6 +1134,7 @@ void LowerTypeTestsModule::importFunction( FDecl->setVisibility(GlobalValue::HiddenVisibility); } else { F->setName(Name + ".cfi"); + maybeReplaceComdat(F, Name); F->setLinkage(GlobalValue::ExternalLinkage); FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, F->getAddressSpace(), Name, &M); @@ -1681,8 +1701,9 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( GlobalValue::PrivateLinkage, M.getDataLayout().getProgramAddressSpace(), ".cfi.jumptable", &M); + ArrayType *JumpTableEntryType = ArrayType::get(Int8Ty, EntrySize); ArrayType *JumpTableType = - ArrayType::get(ArrayType::get(Int8Ty, EntrySize), Functions.size()); + ArrayType::get(JumpTableEntryType, Functions.size()); auto JumpTable = ConstantExpr::getPointerCast( JumpTableFn, PointerType::getUnqual(M.getContext())); @@ -1703,7 +1724,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( if (!IsJumpTableCanonical) { GlobalValue::LinkageTypes LT = IsExported ? 
GlobalValue::ExternalLinkage : GlobalValue::InternalLinkage; - GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT, + GlobalAlias *JtAlias = GlobalAlias::create(JumpTableEntryType, 0, LT, F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M); if (IsExported) @@ -1728,25 +1749,14 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( } else { assert(F->getType()->getAddressSpace() == 0); - GlobalAlias *FAlias = GlobalAlias::create( - F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M); + GlobalAlias *FAlias = + GlobalAlias::create(JumpTableEntryType, 0, F->getLinkage(), "", + CombinedGlobalElemPtr, &M); FAlias->setVisibility(F->getVisibility()); FAlias->takeName(F); if (FAlias->hasName()) { F->setName(FAlias->getName() + ".cfi"); - // For COFF we should also rename the comdat if this function also - // happens to be the key function. Even if the comdat name changes, this - // should still be fine since comdat and symbol resolution happens - // before LTO, so all symbols which would prevail have been selected. - if (F->hasComdat() && ObjectFormat == Triple::COFF && - F->getComdat()->getName() == FAlias->getName()) { - Comdat *OldComdat = F->getComdat(); - Comdat *NewComdat = M.getOrInsertComdat(F->getName()); - for (GlobalObject &GO : M.global_objects()) { - if (GO.getComdat() == OldComdat) - GO.setComdat(NewComdat); - } - } + maybeReplaceComdat(F, FAlias->getName()); } replaceCfiUses(F, FAlias, IsJumpTableCanonical); if (!F->hasLocalLinkage()) diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp index dda3d5a788157..7fd7d4d4f750b 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -385,8 +385,7 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) { // line number. Real instructions generated by optimizations may not come // with a line number either. 
auto HasValidDbgLine = [](Instruction *J) { - return !isa(J) && !isa(J) && - !J->isLifetimeStartOrEnd() && J->getDebugLoc(); + return !isa(J) && !J->isLifetimeStartOrEnd() && J->getDebugLoc(); }; Instruction *J = &*BB->getFirstInsertionPt(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index c1ce364eb1794..0a3837f2c0ce3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1787,6 +1787,34 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) { if (Instruction *Ashr = foldAddToAshr(I)) return Ashr; + // Ceiling division by power-of-2: + // (X >> log2(N)) + zext(X & (N-1) != 0) --> (X + (N-1)) >> log2(N) + // This is valid when adding (N-1) to X doesn't overflow. + { + Value *X; + const APInt *ShiftAmt, *Mask; + CmpPredicate Pred; + + // Match: (X >> C) + zext((X & Mask) != 0) + // or: zext((X & Mask) != 0) + (X >> C) + if (match(&I, m_c_Add(m_OneUse(m_LShr(m_Value(X), m_APInt(ShiftAmt))), + m_ZExt(m_SpecificICmp( + ICmpInst::ICMP_NE, + m_And(m_Deferred(X), m_LowBitMask(Mask)), + m_ZeroInt())))) && + Mask->popcount() == *ShiftAmt) { + + // Check if X + Mask doesn't overflow + Constant *MaskC = ConstantInt::get(X->getType(), *Mask); + if (willNotOverflowUnsignedAdd(X, MaskC, I)) { + // (X + Mask) >> ShiftAmt + Value *Add = Builder.CreateNUWAdd(X, MaskC); + return BinaryOperator::CreateLShr( + Add, ConstantInt::get(X->getType(), *ShiftAmt)); + } + } + } + // (~X) + (~Y) --> -2 - (X + Y) { // To ensure we can save instructions we need to ensure that we consume both diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 8c8cc0859e4af..03897117861f6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3555,32 +3555,13 @@ Instruction 
*InstCombinerImpl::visitCallInst(CallInst &CI) { break; } case Intrinsic::vector_reverse: { - Value *BO0, *BO1, *X, *Y; Value *Vec = II->getArgOperand(0); - if (match(Vec, m_OneUse(m_BinOp(m_Value(BO0), m_Value(BO1))))) { - auto *OldBinOp = cast(Vec); - if (match(BO0, m_VecReverse(m_Value(X)))) { - // rev(binop rev(X), rev(Y)) --> binop X, Y - if (match(BO1, m_VecReverse(m_Value(Y)))) - return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags( - OldBinOp->getOpcode(), X, Y, - OldBinOp, OldBinOp->getName(), - II->getIterator())); - // rev(binop rev(X), BO1Splat) --> binop X, BO1Splat - if (isSplatValue(BO1)) - return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags( - OldBinOp->getOpcode(), X, BO1, - OldBinOp, OldBinOp->getName(), - II->getIterator())); - } - // rev(binop BO0Splat, rev(Y)) --> binop BO0Splat, Y - if (match(BO1, m_VecReverse(m_Value(Y))) && isSplatValue(BO0)) - return replaceInstUsesWith(CI, - BinaryOperator::CreateWithCopiedFlags( - OldBinOp->getOpcode(), BO0, Y, OldBinOp, - OldBinOp->getName(), II->getIterator())); - } + // Note: We canonicalize reverse after binops, so we don't need a + // corresponding binop case here. TODO: Consider canonicalizing + // reverse after fneg? 
+ // rev(unop rev(X)) --> unop X + Value *X; if (match(Vec, m_OneUse(m_UnOp(m_VecReverse(m_Value(X)))))) { auto *OldUnOp = cast(Vec); auto *NewUnOp = UnaryOperator::CreateWithCopiedFlags( diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index c112fae351817..084e7fbaa268a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -7728,6 +7728,30 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { } } + // icmp slt (sub nsw x, y), (add nsw x, y) --> icmp sgt y, 0 + // icmp ult (sub nuw x, y), (add nuw x, y) --> icmp ugt y, 0 + // icmp eq (sub nsw/nuw x, y), (add nsw/nuw x, y) --> icmp eq y, 0 + { + Value *A, *B; + CmpPredicate CmpPred; + if (match(&I, m_c_ICmp(CmpPred, m_Sub(m_Value(A), m_Value(B)), + m_c_Add(m_Deferred(A), m_Deferred(B))))) { + auto *I0 = cast(Op0); + auto *I1 = cast(Op1); + bool I0NUW = I0->hasNoUnsignedWrap(); + bool I1NUW = I1->hasNoUnsignedWrap(); + bool I0NSW = I0->hasNoSignedWrap(); + bool I1NSW = I1->hasNoSignedWrap(); + if ((ICmpInst::isUnsigned(Pred) && I0NUW && I1NUW) || + (ICmpInst::isSigned(Pred) && I0NSW && I1NSW) || + (ICmpInst::isEquality(Pred) && + ((I0NUW || I0NSW) && (I1NUW || I1NSW)))) { + return new ICmpInst(CmpPredicate::getSwapped(CmpPred), B, + ConstantInt::get(Op0->getType(), 0)); + } + } + } + // Try to optimize equality comparisons against alloca-based pointers. 
if (Op0->getType()->isPointerTy() && I.isEquality()) { assert(Op1->getType()->isPointerTy() && diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 1d208de75db3b..a9751ab03e20e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -18,7 +18,6 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PatternMatch.h" diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 4fe900e9421f8..e2cd2a59fab91 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -4787,11 +4787,7 @@ bool InstCombinerImpl::freezeOtherUses(FreezeInst &FI) { MoveBefore = *MoveBeforeOpt; } - // Don't move to the position of a debug intrinsic. - if (isa(MoveBefore)) - MoveBefore = MoveBefore->getNextNonDebugInstruction()->getIterator(); - // Re-point iterator to come after any debug-info records, if we're - // running in "RemoveDIs" mode + // Re-point iterator to come after any debug-info records. 
MoveBefore.setHeadBit(false); bool Changed = false; @@ -5582,11 +5578,9 @@ bool InstCombinerImpl::prepareWorklist(Function &F) { continue; unsigned NumDeadInstInBB; - unsigned NumDeadDbgInstInBB; - std::tie(NumDeadInstInBB, NumDeadDbgInstInBB) = - removeAllNonTerminatorAndEHPadInstructions(&BB); + NumDeadInstInBB = removeAllNonTerminatorAndEHPadInstructions(&BB); - MadeIRChange |= NumDeadInstInBB + NumDeadDbgInstInBB > 0; + MadeIRChange |= NumDeadInstInBB != 0; NumDeadInst += NumDeadInstInBB; } diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 9351a42581ba0..3dfb36f4f1815 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -583,10 +583,6 @@ static bool functionHasLines(const Function &F, unsigned &EndLine) { EndLine = 0; for (const auto &BB : F) { for (const auto &I : BB) { - // Debug intrinsic locations correspond to the location of the - // declaration, not necessarily any statements or expressions. - if (isa(&I)) continue; - const DebugLoc &Loc = I.getDebugLoc(); if (!Loc) continue; @@ -874,10 +870,6 @@ bool GCOVProfiler::emitProfileNotes( } for (const auto &I : BB) { - // Debug intrinsic locations correspond to the location of the - // declaration, not necessarily any statements or expressions. 
- if (isa(&I)) continue; - const DebugLoc &Loc = I.getDebugLoc(); if (!Loc) continue; diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index ec9f78edfeb1c..8ae6f7745a9e7 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -527,8 +527,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, AtomicAccesses.push_back(&Inst); else if (isa(Inst) || isa(Inst)) LocalLoadsAndStores.push_back(&Inst); - else if ((isa(Inst) && !isa(Inst)) || - isa(Inst)) { + else if (isa(Inst) || isa(Inst)) { if (CallInst *CI = dyn_cast(&Inst)) maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI); if (isa(Inst)) diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp index ea907af96edd9..985b9c0e53125 100644 --- a/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -562,20 +562,7 @@ ADCEChanged AggressiveDeadCodeElimination::removeDeadInstructions() { if (isLive(&I)) continue; - if (auto *DII = dyn_cast(&I)) { - // Avoid removing a dbg.assign that is linked to instructions because it - // holds information about an existing store. - if (auto *DAI = dyn_cast(DII)) - if (!at::getAssignmentInsts(DAI).empty()) - continue; - // Check if the scope of this variable location is alive. - if (AliveScopes.count(DII->getDebugLoc()->getScope())) - continue; - - // Fallthrough and drop the intrinsic. - } else { - Changed.ChangedNonDebugInstr = true; - } + Changed.ChangedNonDebugInstr = true; // Prepare to delete. 
Worklist.push_back(&I); diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 839f5933e09b0..db594e033e21e 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -44,7 +44,6 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index c8a0479358eab..d9d05c3e8cc49 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -474,6 +474,19 @@ void GVNPass::ValueTable::add(Value *V, uint32_t Num) { NumberingPhi[Num] = PN; } +/// Include the incoming memory state into the hash of the expression for the +/// given instruction. If the incoming memory state is: +/// * LiveOnEntry, add the value number of the entry block, +/// * a MemoryPhi, add the value number of the basic block corresponding to that +/// MemoryPhi, +/// * a MemoryDef, add the value number of the memory setting instruction. +void GVNPass::ValueTable::addMemoryStateToExp(Instruction *I, Expression &Exp) { + assert(MSSA && "addMemoryStateToExp should not be called without MemorySSA"); + assert(MSSA->getMemoryAccess(I) && "Instruction does not access memory"); + MemoryAccess *MA = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(I); + Exp.VarArgs.push_back(lookupOrAdd(MA)); +} + uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) { // FIXME: Currently the calls which may access the thread id may // be considered as not accessing the memory. 
But this is @@ -594,15 +607,48 @@ uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) { return V; } + if (MSSA && IsMSSAEnabled && AA->onlyReadsMemory(C)) { + Expression Exp = createExpr(C); + addMemoryStateToExp(C, Exp); + auto [V, _] = assignExpNewValueNum(Exp); + ValueNumbering[C] = V; + return V; + } + ValueNumbering[C] = NextValueNumber; return NextValueNumber++; } +/// Returns the value number for the specified load or store instruction. +uint32_t GVNPass::ValueTable::computeLoadStoreVN(Instruction *I) { + if (!MSSA || !IsMSSAEnabled) { + ValueNumbering[I] = NextValueNumber; + return NextValueNumber++; + } + + Expression Exp; + Exp.Ty = I->getType(); + Exp.Opcode = I->getOpcode(); + for (Use &Op : I->operands()) + Exp.VarArgs.push_back(lookupOrAdd(Op)); + addMemoryStateToExp(I, Exp); + + auto [V, _] = assignExpNewValueNum(Exp); + ValueNumbering[I] = V; + return V; +} + /// Returns true if a value number exists for the specified value. bool GVNPass::ValueTable::exists(Value *V) const { return ValueNumbering.contains(V); } +uint32_t GVNPass::ValueTable::lookupOrAdd(MemoryAccess *MA) { + return MSSA->isLiveOnEntryDef(MA) || isa(MA) + ? lookupOrAdd(MA->getBlock()) + : lookupOrAdd(cast(MA)->getMemoryInst()); +} + /// lookupOrAdd - Returns the value number for the specified value, assigning /// it a new number if it did not have one before. 
uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) { @@ -613,6 +659,8 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) { auto *I = dyn_cast(V); if (!I) { ValueNumbering[V] = NextValueNumber; + if (isa(V)) + NumberingBB[NextValueNumber] = cast(V); return NextValueNumber++; } @@ -672,6 +720,9 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) { ValueNumbering[V] = NextValueNumber; NumberingPhi[NextValueNumber] = cast(V); return NextValueNumber++; + case Instruction::Load: + case Instruction::Store: + return computeLoadStoreVN(I); default: ValueNumbering[V] = NextValueNumber; return NextValueNumber++; @@ -709,6 +760,7 @@ void GVNPass::ValueTable::clear() { ValueNumbering.clear(); ExpressionNumbering.clear(); NumberingPhi.clear(); + NumberingBB.clear(); PhiTranslateTable.clear(); NextValueNumber = 1; Expressions.clear(); @@ -723,6 +775,8 @@ void GVNPass::ValueTable::erase(Value *V) { // If V is PHINode, V <--> value number is an one-to-one mapping. if (isa(V)) NumberingPhi.erase(Num); + else if (isa(V)) + NumberingBB.erase(Num); } /// verifyRemoved - Verify that the value is removed from all internal data @@ -2310,15 +2364,39 @@ bool GVNPass::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum, uint32_t GVNPass::ValueTable::phiTranslateImpl(const BasicBlock *Pred, const BasicBlock *PhiBlock, uint32_t Num, GVNPass &GVN) { + // See if we can refine the value number by looking at the PN incoming value + // for the given predecessor. 
if (PHINode *PN = NumberingPhi[Num]) { - for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) { - if (PN->getParent() == PhiBlock && PN->getIncomingBlock(I) == Pred) - if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false)) - return TransVal; - } + if (PN->getParent() == PhiBlock) + for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) + if (PN->getIncomingBlock(I) == Pred) + if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false)) + return TransVal; return Num; } + if (BasicBlock *BB = NumberingBB[Num]) { + assert(MSSA && "NumberingBB is non-empty only when using MemorySSA"); + // Value numbers of basic blocks are used to represent memory state in + // load/store instructions and read-only function calls when said state is + // set by a MemoryPhi. + if (BB != PhiBlock) + return Num; + MemoryPhi *MPhi = MSSA->getMemoryAccess(BB); + for (unsigned i = 0, N = MPhi->getNumIncomingValues(); i != N; ++i) { + if (MPhi->getIncomingBlock(i) != Pred) + continue; + MemoryAccess *MA = MPhi->getIncomingValue(i); + if (auto *PredPhi = dyn_cast(MA)) + return lookupOrAdd(PredPhi->getBlock()); + if (MSSA->isLiveOnEntryDef(MA)) + return lookupOrAdd(&BB->getParent()->getEntryBlock()); + return lookupOrAdd(cast(MA)->getMemoryInst()); + } + llvm_unreachable( + "CFG/MemorySSA mismatch: predecessor not found among incoming blocks"); + } + // If there is any value related with Num is defined in a BB other than // PhiBlock, it cannot depend on a phi in PhiBlock without going through // a backedge. We can do an early exit in that case to save compile time. @@ -2606,10 +2684,6 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS, /// When calculating availability, handle an instruction /// by inserting it into the appropriate sets. bool GVNPass::processInstruction(Instruction *I) { - // Ignore dbg info intrinsics. - if (isa(I)) - return false; - // If the instruction can be easily simplified then do so now in preference // to value numbering it. 
Value numbering often exposes redundancies, for // example if it determines that %y is equal to %x then the instruction @@ -2761,6 +2835,7 @@ bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, ICF = &ImplicitCFT; this->LI = &LI; VN.setMemDep(MD); + VN.setMemorySSA(MSSA); ORE = RunORE; InvalidBlockRPONumbers = true; MemorySSAUpdater Updater(MSSA); @@ -2895,8 +2970,7 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, bool GVNPass::performScalarPRE(Instruction *CurInst) { if (isa(CurInst) || CurInst->isTerminator() || isa(CurInst) || CurInst->getType()->isVoidTy() || - CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || - isa(CurInst)) + CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects()) return false; // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp index 1c2e1531e47d8..0acbaf58a8f74 100644 --- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -1166,8 +1166,7 @@ std::pair GVNHoist::hoistExpressions(Function &F) { SI.insert(Store, VN); else if (auto *Call = dyn_cast(&I1)) { if (auto *Intr = dyn_cast(Call)) { - if (isa(Intr) || - Intr->getIntrinsicID() == Intrinsic::assume || + if (Intr->getIntrinsicID() == Intrinsic::assume || Intr->getIntrinsicID() == Intrinsic::sideeffect) continue; } diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index bd59caa6a959a..abb6ff1dcfe6c 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1204,10 +1204,6 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, return !Invalidated; } else if (CallInst *CI = dyn_cast(&I)) { - // Don't sink or hoist dbg info; it's legal, but not useful. - if (isa(I)) - return false; - // Don't sink calls which can throw. 
if (CI->mayThrow()) return false; diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 242e571c072af..4ba69034d6448 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -5613,8 +5613,7 @@ BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand( } } - assert(!isa(LowestIP) && !LowestIP->isEHPad() - && !isa(LowestIP) && + assert(!isa(LowestIP) && !LowestIP->isEHPad() && "Insertion point must be a normal instruction"); // Then, climb up the immediate dominator tree as far as we can go while @@ -5627,9 +5626,6 @@ BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand( // Ignore landingpad instructions. while (IP->isEHPad()) ++IP; - // Ignore debug intrinsics. - while (isa(IP)) ++IP; - // Set IP below instructions recently inserted by SCEVExpander. This keeps the // IP consistent across expansions and allows the previously inserted // instructions to be reused by subsequent expansion. 
@@ -6008,9 +6004,8 @@ static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup, Instruction *I = Fixup.UserInst; Type *Ty = I->getType(); - return Ty->isIntegerTy() && - ((isa(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) || - (isa(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty))); + return (isa(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) || + (isa(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)); } /// Rewrite all the fixup locations with new values, following the chosen diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index afa7abfea419e..a22d84dcf014d 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -551,7 +551,7 @@ static std::optional analyzeLoopUnrollCost( for (Instruction &I : *BB) { // These won't get into the final code - don't even try calculating the // cost for them. - if (isa(I) || EphValues.count(&I)) + if (EphValues.count(&I)) continue; // Track this instruction's expected baseline cost when executing the diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 1e37f40fa9d52..96b156494fd91 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -1146,24 +1146,24 @@ class LowerMatrixIntrinsics { Value *Op1; Value *Op2; MatrixTy Result; + IRBuilder<> Builder(Inst); if (auto *BinOp = dyn_cast(Inst)) - Result = VisitBinaryOperator(BinOp, SI); + Result = VisitBinaryOperator(BinOp, SI, Builder); else if (auto *Cast = dyn_cast(Inst)) - Result = VisitCastInstruction(Cast, SI); + Result = VisitCastInstruction(Cast, SI, Builder); else if (auto *UnOp = dyn_cast(Inst)) - Result = VisitUnaryOperator(UnOp, SI); + Result = VisitUnaryOperator(UnOp, SI, Builder); else if (auto *Intr = dyn_cast(Inst)) - Result = VisitIntrinsicInst(Intr, SI); + Result = 
VisitIntrinsicInst(Intr, SI, Builder); else if (auto *Select = dyn_cast(Inst)) - Result = VisitSelectInst(Select, SI); + Result = VisitSelectInst(Select, SI, Builder); else if (match(Inst, m_Load(m_Value(Op1)))) - Result = VisitLoad(cast(Inst), SI, Op1); + Result = VisitLoad(cast(Inst), SI, Op1, Builder); else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2)))) - Result = VisitStore(cast(Inst), SI, Op1, Op2); + Result = VisitStore(cast(Inst), SI, Op1, Op2, Builder); else continue; - IRBuilder<> Builder(Inst); finalizeLowering(Inst, Result, Builder); Changed = true; } @@ -1204,22 +1204,22 @@ class LowerMatrixIntrinsics { } /// Replace intrinsic calls. - MatrixTy VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &SI) { + MatrixTy VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &SI, + IRBuilder<> &Builder) { assert(Inst->getCalledFunction() && Inst->getCalledFunction()->isIntrinsic()); switch (Inst->getCalledFunction()->getIntrinsicID()) { case Intrinsic::matrix_multiply: - return LowerMultiply(Inst); + return LowerMultiply(Inst, Builder); case Intrinsic::matrix_transpose: - return LowerTranspose(Inst); + return LowerTranspose(Inst, Builder); case Intrinsic::matrix_column_major_load: - return LowerColumnMajorLoad(Inst); + return LowerColumnMajorLoad(Inst, Builder); case Intrinsic::matrix_column_major_store: - return LowerColumnMajorStore(Inst); + return LowerColumnMajorStore(Inst, Builder); case Intrinsic::abs: case Intrinsic::fabs: { - IRBuilder<> Builder(Inst); MatrixTy Result; MatrixTy M = getMatrix(Inst->getOperand(0), SI, Builder); Builder.setFastMathFlags(getFastMathFlags(Inst)); @@ -1298,7 +1298,6 @@ class LowerMatrixIntrinsics { ShapeInfo MatrixShape, Value *I, Value *J, ShapeInfo ResultShape, Type *EltTy, IRBuilder<> &Builder) { - Value *Offset = Builder.CreateAdd( Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I); @@ -1313,8 +1312,8 @@ class LowerMatrixIntrinsics { /// Lower a load instruction with shape information. 
MatrixTy LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align, - Value *Stride, bool IsVolatile, ShapeInfo Shape) { - IRBuilder<> Builder(Inst); + Value *Stride, bool IsVolatile, ShapeInfo Shape, + IRBuilder<> &Builder) { return loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile, Shape, Builder); } @@ -1322,14 +1321,14 @@ class LowerMatrixIntrinsics { /// Lowers llvm.matrix.column.major.load. /// /// The intrinsic loads a matrix from memory using a stride between columns. - MatrixTy LowerColumnMajorLoad(CallInst *Inst) { + MatrixTy LowerColumnMajorLoad(CallInst *Inst, IRBuilder<> &Builder) { assert(MatrixLayout == MatrixLayoutTy::ColumnMajor && "Intrinsic only supports column-major layout!"); Value *Ptr = Inst->getArgOperand(0); Value *Stride = Inst->getArgOperand(1); return LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride, cast(Inst->getArgOperand(2))->isOne(), - {Inst->getArgOperand(3), Inst->getArgOperand(4)}); + {Inst->getArgOperand(3), Inst->getArgOperand(4)}, Builder); } /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p @@ -1374,8 +1373,7 @@ class LowerMatrixIntrinsics { /// Lower a store instruction with shape information. MatrixTy LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, MaybeAlign A, Value *Stride, bool IsVolatile, - ShapeInfo Shape) { - IRBuilder<> Builder(Inst); + ShapeInfo Shape, IRBuilder<> &Builder) { auto StoreVal = getMatrix(Matrix, Shape, Builder); return storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride, IsVolatile, Builder); @@ -1384,7 +1382,7 @@ class LowerMatrixIntrinsics { /// Lowers llvm.matrix.column.major.store. /// /// The intrinsic store a matrix back memory using a stride between columns. 
- MatrixTy LowerColumnMajorStore(CallInst *Inst) { + MatrixTy LowerColumnMajorStore(CallInst *Inst, IRBuilder<> &Builder) { assert(MatrixLayout == MatrixLayoutTy::ColumnMajor && "Intrinsic only supports column-major layout!"); Value *Matrix = Inst->getArgOperand(0); @@ -1392,7 +1390,8 @@ class LowerMatrixIntrinsics { Value *Stride = Inst->getArgOperand(2); return LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride, cast(Inst->getArgOperand(3))->isOne(), - {Inst->getArgOperand(4), Inst->getArgOperand(5)}); + {Inst->getArgOperand(4), Inst->getArgOperand(5)}, + Builder); } // Set elements I..I+NumElts-1 to Block @@ -2167,8 +2166,7 @@ class LowerMatrixIntrinsics { } /// Lowers llvm.matrix.multiply. - MatrixTy LowerMultiply(CallInst *MatMul) { - IRBuilder<> Builder(MatMul); + MatrixTy LowerMultiply(CallInst *MatMul, IRBuilder<> &Builder) { auto *EltType = cast(MatMul->getType())->getElementType(); ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3)); ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4)); @@ -2193,9 +2191,8 @@ class LowerMatrixIntrinsics { } /// Lowers llvm.matrix.transpose. - MatrixTy LowerTranspose(CallInst *Inst) { + MatrixTy LowerTranspose(CallInst *Inst, IRBuilder<> &Builder) { MatrixTy Result; - IRBuilder<> Builder(Inst); Value *InputVal = Inst->getArgOperand(0); FixedVectorType *VectorTy = cast(InputVal->getType()); ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2)); @@ -2228,26 +2225,26 @@ class LowerMatrixIntrinsics { } /// Lower load instructions. 
- MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr) { - IRBuilder<> Builder(Inst); + MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr, + IRBuilder<> &Builder) { return LowerLoad(Inst, Ptr, Inst->getAlign(), - Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI); + Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI, + Builder); } MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal, - Value *Ptr) { - IRBuilder<> Builder(Inst); + Value *Ptr, IRBuilder<> &Builder) { return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(), - Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI); + Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI, + Builder); } /// Lower binary operators. - MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI) { + MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI, + IRBuilder<> &Builder) { Value *Lhs = Inst->getOperand(0); Value *Rhs = Inst->getOperand(1); - IRBuilder<> Builder(Inst); - MatrixTy Result; MatrixTy A = getMatrix(Lhs, SI, Builder); MatrixTy B = getMatrix(Rhs, SI, Builder); @@ -2265,11 +2262,10 @@ class LowerMatrixIntrinsics { } /// Lower unary operators. - MatrixTy VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI) { + MatrixTy VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI, + IRBuilder<> &Builder) { Value *Op = Inst->getOperand(0); - IRBuilder<> Builder(Inst); - MatrixTy Result; MatrixTy M = getMatrix(Op, SI, Builder); @@ -2293,11 +2289,10 @@ class LowerMatrixIntrinsics { } /// Lower cast instructions. - MatrixTy VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape) { + MatrixTy VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape, + IRBuilder<> &Builder) { Value *Op = Inst->getOperand(0); - IRBuilder<> Builder(Inst); - MatrixTy Result; MatrixTy M = getMatrix(Op, Shape, Builder); @@ -2315,13 +2310,12 @@ class LowerMatrixIntrinsics { } /// Lower selects. 
- MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape) { + MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape, + IRBuilder<> &Builder) { Value *Cond = Inst->getOperand(0); Value *OpA = Inst->getOperand(1); Value *OpB = Inst->getOperand(2); - IRBuilder<> Builder(Inst); - MatrixTy Result; MatrixTy A = getMatrix(OpA, Shape, Builder); MatrixTy B = getMatrix(OpB, Shape, Builder); diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index d20378ece4eea..a09303bb4469f 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -31,7 +31,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/ValueHandle.h" diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index cb202f5f71b91..f053e202655be 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -296,10 +296,6 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( }; auto AllPrecedingUsesFromBlockHoisted = [&HasNoUnhoistedInstr](const User *U) { - // Do not hoist any debug info intrinsics. - if (isa(U)) - return false; - return HasNoUnhoistedInstr(U->operand_values()); }; @@ -313,9 +309,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( if (TotalSpeculationCost > SpecExecMaxSpeculationCost) return false; // too much to hoist } else { - // Debug info intrinsics should not be counted for threshold. 
- if (!isa(I)) - NotHoistedInstCount++; + NotHoistedInstCount++; if (NotHoistedInstCount > SpecExecMaxNotHoisted) return false; // too much left behind NotHoisted.insert(&I); diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index c71c5a70a12fd..e7d989a43840d 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -239,8 +239,7 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) { // A PseudoProbeInst has the IntrInaccessibleMemOnly tag hence it is // considered accessing memory and will be marked as a tail call if we // don't bail out here. - if (!CI || CI->isTailCall() || isa(&I) || - isa(&I)) + if (!CI || CI->isTailCall() || isa(&I)) continue; // Bail out for intrinsic stackrestore call because it can modify @@ -335,9 +334,6 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) { /// instructions between the call and this instruction are movable. /// static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { - if (isa(I)) - return true; - if (const IntrinsicInst *II = dyn_cast(I)) if (II->getIntrinsicID() == Intrinsic::lifetime_end && llvm::findAllocaForValue(II->getArgOperand(1))) @@ -396,12 +392,6 @@ static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) { return true; } -static Instruction *firstNonDbg(BasicBlock::iterator I) { - while (isa(I)) - ++I; - return &*I; -} - namespace { class TailRecursionEliminator { Function &F; @@ -493,9 +483,8 @@ CallInst *TailRecursionEliminator::findTRECandidate(BasicBlock *BB) { // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call // and disable this xform in this case, because the code generator will // lower the call to fabs into inline code. 
- if (BB == &F.getEntryBlock() && - firstNonDbg(BB->front().getIterator()) == CI && - firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() && + if (BB == &F.getEntryBlock() && &BB->front() == CI && + &*std::next(BB->begin()) == TI && CI->getCalledFunction() && !TTI->isLoweredToCall(CI->getCalledFunction())) { // A single-block function with just a call and a return. Check that // the arguments match. diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 1feed14b4fed8..98c65ae11b1c3 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -28,11 +28,10 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 1210bdf4a1c98..9883974c55e3b 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -471,10 +471,6 @@ CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC, Info.LifeEnd = IntrInst; continue; } - // At this point, permit debug uses outside of the region. - // This is fixed in a later call to fixupDebugInfoPostExtraction(). - if (isa(IntrInst)) - continue; } // Find untracked uses of the address, bail. if (!definedInRegion(Blocks, U)) @@ -1077,10 +1073,6 @@ static void applyFirstDebugLoc(Function *oldFunction, return any_of(*BB, [&BranchI](const Instruction &I) { if (!I.getDebugLoc()) return false; - // Don't use source locations attached to debug-intrinsics: they could - // be from completely unrelated scopes. 
- if (isa(I)) - return false; BranchI->setDebugLoc(I.getDebugLoc()); return true; }); @@ -1329,7 +1321,6 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, // 2) They need to point to fresh metadata, e.g. because they currently // point to a variable in the wrong scope. SmallDenseMap RemappedMetadata; - SmallVector DebugIntrinsicsToDelete; SmallVector DVRsToDelete; DenseMap Cache; @@ -1370,55 +1361,29 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, } DbgVariableRecord &DVR = cast(DR); - // Apply the two updates that dbg.values get: invalid operands, and - // variable metadata fixup. + // If any of the used locations are invalid, delete the record. if (any_of(DVR.location_ops(), IsInvalidLocation)) { DVRsToDelete.push_back(&DVR); continue; } + + // DbgAssign intrinsics have an extra Value argument: if (DVR.isDbgAssign() && IsInvalidLocation(DVR.getAddress())) { DVRsToDelete.push_back(&DVR); continue; } + + // If the variable was in the scope of the old function, i.e. it was not + // inlined, point the intrinsic to a fresh variable within the new + // function. if (!DVR.getDebugLoc().getInlinedAt()) DVR.setVariable(GetUpdatedDIVariable(DVR.getVariable())); } }; - for (Instruction &I : instructions(NewFunc)) { + for (Instruction &I : instructions(NewFunc)) UpdateDbgRecordsOnInst(I); - auto *DII = dyn_cast(&I); - if (!DII) - continue; - - // Point the intrinsic to a fresh label within the new function if the - // intrinsic was not inlined from some other function. - if (auto *DLI = dyn_cast(&I)) { - UpdateDbgLabel(DLI); - continue; - } - - auto *DVI = cast(DII); - // If any of the used locations are invalid, delete the intrinsic. 
- if (any_of(DVI->location_ops(), IsInvalidLocation)) { - DebugIntrinsicsToDelete.push_back(DVI); - continue; - } - // DbgAssign intrinsics have an extra Value argument: - if (auto *DAI = dyn_cast(DVI); - DAI && IsInvalidLocation(DAI->getAddress())) { - DebugIntrinsicsToDelete.push_back(DVI); - continue; - } - // If the variable was in the scope of the old function, i.e. it was not - // inlined, point the intrinsic to a fresh variable within the new function. - if (!DVI->getDebugLoc().getInlinedAt()) - DVI->setVariable(GetUpdatedDIVariable(DVI->getVariable())); - } - - for (auto *DII : DebugIntrinsicsToDelete) - DII->eraseFromParent(); for (auto *DVR : DVRsToDelete) DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR); DIB.finalizeSubprogram(NewSP); diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 729813a92f516..c2dbdc57eb3b5 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -299,7 +299,7 @@ bool llvm::stripDebugifyMetadata(Module &M) { bool hasLoc(const Instruction &I) { const DILocation *Loc = I.getDebugLoc().get(); -#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE DebugLocKind Kind = I.getDebugLoc().getKind(); return Loc || Kind != DebugLocKind::Normal; #else @@ -353,7 +353,7 @@ bool llvm::collectDebugInfoMetadata(Module &M, // Cllect dbg.values and dbg.declare. if (DebugifyLevel > Level::Locations) { - auto HandleDbgVariable = [&](auto *DbgVar) { + auto HandleDbgVariable = [&](DbgVariableRecord *DbgVar) { if (!SP) return; // Skip inlined variables. @@ -368,14 +368,8 @@ bool llvm::collectDebugInfoMetadata(Module &M, }; for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) HandleDbgVariable(&DVR); - if (auto *DVI = dyn_cast(&I)) - HandleDbgVariable(DVI); } - // Skip debug instructions other than dbg.value and dbg.declare. 
- if (isa(&I)) - continue; - LLVM_DEBUG(dbgs() << " Collecting info for inst: " << I << '\n'); DebugInfoBeforePass.InstToDelete.insert({&I, &I}); @@ -597,7 +591,7 @@ bool llvm::checkDebugInfoMetadata(Module &M, // Collect dbg.values and dbg.declares. if (DebugifyLevel > Level::Locations) { - auto HandleDbgVariable = [&](auto *DbgVar) { + auto HandleDbgVariable = [&](DbgVariableRecord *DbgVar) { if (!SP) return; // Skip inlined variables. @@ -612,14 +606,8 @@ bool llvm::checkDebugInfoMetadata(Module &M, }; for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) HandleDbgVariable(&DVR); - if (auto *DVI = dyn_cast(&I)) - HandleDbgVariable(DVI); } - // Skip debug instructions other than dbg.value and dbg.declare. - if (isa(&I)) - continue; - LLVM_DEBUG(dbgs() << " Collecting info for inst: " << I << '\n'); DebugInfoAfterPass.DILocations.insert({&I, hasLoc(I)}); diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp index d1db2ee29f3a2..3a5c7a3b1738e 100644 --- a/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -353,13 +353,6 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB, } else if (isa(CurInst) || isa(CurInst)) { CallBase &CB = *cast(&*CurInst); - // Debug info can safely be ignored here. - if (isa(CB)) { - LLVM_DEBUG(dbgs() << "Ignoring debug info.\n"); - ++CurInst; - continue; - } - // Cannot handle inline asm. if (CB.isInlineAsm()) { LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n"); diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index f47c467d15140..7df5e9958182c 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1927,16 +1927,11 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, } } - // Remove debug info intrinsics if we're not keeping inline info. 
+ // Remove debug info records if we're not keeping inline info. if (NoInlineLineTables) { BasicBlock::iterator BI = FI->begin(); while (BI != FI->end()) { - if (isa(BI)) { - BI = BI->eraseFromParent(); - continue; - } else { - BI->dropDbgRecords(); - } + BI->dropDbgRecords(); ++BI; } } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index a3252a69874d3..f5208d50c6aae 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -49,7 +49,6 @@ #include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalObject.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" @@ -2849,10 +2848,8 @@ bool llvm::handleUnreachableTerminator( return Changed; } -std::pair -llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) { +unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) { unsigned NumDeadInst = 0; - unsigned NumDeadDbgInst = 0; // Delete the instructions backwards, as it has a reduced likelihood of // having to update as many def-use and use-def chains. Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. @@ -2871,15 +2868,12 @@ llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) { EndInst = Inst; continue; } - if (isa(Inst)) - ++NumDeadDbgInst; - else - ++NumDeadInst; + ++NumDeadInst; // RemoveDIs: erasing debug-info must be done manually. 
Inst->dropDbgRecords(); Inst->eraseFromParent(); } - return {NumDeadInst, NumDeadDbgInst}; + return NumDeadInst; } unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA, diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index f343962548259..27e70c5ddc0fc 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -545,8 +545,11 @@ countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE, const SCEV *IterVal = AddRec->evaluateAtIteration( SE.getConstant(AddRec->getType(), NewPeelCount), SE); if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, BoundSCEV, Step, - Pred)) + Pred)) { + if (shouldPeelLastIteration(L, Pred, AddRec, BoundSCEV, SE, TTI)) + DesiredPeelCountLast = 1; return; + } DesiredPeelCount = NewPeelCount; }; diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 6b42503b2e015..66d0573e83f65 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -547,36 +547,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // possible or create a clone in the OldPreHeader if not. Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); - // Record all debug intrinsics preceding LoopEntryBranch to avoid + // Record all debug records preceding LoopEntryBranch to avoid // duplication. 
- using DbgIntrinsicHash = + using DbgHash = std::pair, DIExpression *>; - auto makeHash = [](auto *D) -> DbgIntrinsicHash { + auto makeHash = [](const DbgVariableRecord *D) -> DbgHash { auto VarLocOps = D->location_ops(); return {{hash_combine_range(VarLocOps), D->getVariable()}, D->getExpression()}; }; - SmallDenseSet DbgIntrinsics; - for (Instruction &I : llvm::drop_begin(llvm::reverse(*OrigPreheader))) { - if (auto *DII = dyn_cast(&I)) { - DbgIntrinsics.insert(makeHash(DII)); - // Until RemoveDIs supports dbg.declares in DbgVariableRecord format, - // we'll need to collect DbgVariableRecords attached to any other debug - // intrinsics. - for (const DbgVariableRecord &DVR : - filterDbgVars(DII->getDbgRecordRange())) - DbgIntrinsics.insert(makeHash(&DVR)); - } else { - break; - } - } - + SmallDenseSet DbgRecords; // Build DbgVariableRecord hashes for DbgVariableRecords attached to the - // terminator, which isn't considered in the loop above. + // terminator. for (const DbgVariableRecord &DVR : filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange())) - DbgIntrinsics.insert(makeHash(&DVR)); + DbgRecords.insert(makeHash(&DVR)); // Remember the local noalias scope declarations in the header. After the // rotation, they must be duplicated and the scope must be cloned. This @@ -623,7 +609,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // memory (without proving that the loop doesn't write). if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() && !Inst->isTerminator() && - !isa(Inst) && !isa(Inst) && + !isa(Inst) && // It is not safe to hoist the value of these instructions in // coroutines, as the addresses of otherwise eligible variables (e.g. // thread-local variables and errno) may change if the coroutine is @@ -642,7 +628,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Erase anything we've seen before. 
for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(DbgValueRange))) - if (DbgIntrinsics.count(makeHash(&DVR))) + if (DbgRecords.count(makeHash(&DVR))) DVR.eraseFromParent(); } @@ -671,7 +657,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Erase anything we've seen before. for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(Range))) - if (DbgIntrinsics.count(makeHash(&DVR))) + if (DbgRecords.count(makeHash(&DVR))) DVR.eraseFromParent(); } @@ -679,13 +665,6 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { RemapInstruction(C, ValueMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - // Avoid inserting the same intrinsic twice. - if (auto *DII = dyn_cast(C)) - if (DbgIntrinsics.count(makeHash(DII))) { - C->eraseFromParent(); - continue; - } - // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. @@ -806,7 +785,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE, &InsertedPHIs); - // Attach dbg.value intrinsics to the new phis if that phi uses a value that + // Attach debug records to the new phis if that phi uses a value that // previously had debug metadata attached. This keeps the debug info // up-to-date in the loop body. 
if (!InsertedPHIs.empty()) @@ -952,9 +931,6 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, if (!isSafeToSpeculativelyExecute(&*I)) return false; - if (isa(I)) - continue; - switch (I->getOpcode()) { default: return false; diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 70afd4133df7c..24fe08d6c3e4e 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -182,8 +182,7 @@ SCEVExpander::GetOptimalInsertionPointForCastOf(Value *V) const { BasicBlock::iterator IP = A->getParent()->getEntryBlock().begin(); while ((isa(IP) && isa(cast(IP)->getOperand(0)) && - cast(IP)->getOperand(0) != A) || - isa(IP)) + cast(IP)->getOperand(0) != A)) ++IP; return IP; } @@ -278,11 +277,6 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, if (IP != BlockBegin) { --IP; for (; ScanLimit; --IP, --ScanLimit) { - // Don't count dbg.value against the ScanLimit, to avoid perturbing the - // generated code. - if (isa(IP)) - ScanLimit++; - auto canGenerateIncompatiblePoison = [&Flags](Instruction *I) { // Ensure that no-wrap flags match. if (isa(I)) { @@ -382,10 +376,6 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *Offset, Value *V, if (IP != BlockBegin) { --IP; for (; ScanLimit; --IP, --ScanLimit) { - // Don't count dbg.value against the ScanLimit, to avoid perturbing the - // generated code. 
- if (isa(IP)) - ScanLimit++; if (auto *GEP = dyn_cast(IP)) { if (GEP->getPointerOperand() == V && GEP->getSourceElementType() == Builder.getInt8Ty() && @@ -1545,8 +1535,7 @@ Value *SCEVExpander::expand(const SCEV *S) { InsertPt = L->getHeader()->getFirstInsertionPt(); while (InsertPt != Builder.GetInsertPoint() && - (isInsertedInstruction(&*InsertPt) || - isa(&*InsertPt))) { + (isInsertedInstruction(&*InsertPt))) { InsertPt = std::next(InsertPt); } break; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 0980f0e57aa6d..eb52c1b7e6fba 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1130,17 +1130,14 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses( Instruction *NewBonusInst = BonusInst.clone(); - if (!isa(BonusInst)) { - if (!NewBonusInst->getDebugLoc().isSameSourceLocation( - PTI->getDebugLoc())) { - // Unless the instruction has the same !dbg location as the original - // branch, drop it. When we fold the bonus instructions we want to make - // sure we reset their debug locations in order to avoid stepping on - // dead code caused by folding dead branches. - NewBonusInst->setDebugLoc(DebugLoc::getDropped()); - } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) { - mapAtomInstance(DL, VMap); - } + if (!NewBonusInst->getDebugLoc().isSameSourceLocation(PTI->getDebugLoc())) { + // Unless the instruction has the same !dbg location as the original + // branch, drop it. When we fold the bonus instructions we want to make + // sure we reset their debug locations in order to avoid stepping on + // dead code caused by folding dead branches. 
+ NewBonusInst->setDebugLoc(DebugLoc::getDropped()); + } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) { + mapAtomInstance(DL, VMap); } RemapInstruction(NewBonusInst, VMap, @@ -1158,9 +1155,6 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses( RemapDbgRecordRange(NewBonusInst->getModule(), Range, VMap, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - if (isa(BonusInst)) - continue; - NewBonusInst->takeName(&BonusInst); BonusInst.setName(NewBonusInst->getName() + ".old"); VMap[&BonusInst] = NewBonusInst; @@ -1903,21 +1897,6 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI, Instruction *I1 = &*BB1ItrPair.first; - // Skip debug info if it is not identical. - bool AllDbgInstsAreIdentical = all_of(OtherSuccIterRange, [I1](auto &Iter) { - Instruction *I2 = &*Iter; - return I1->isIdenticalToWhenDefined(I2); - }); - if (!AllDbgInstsAreIdentical) { - while (isa(I1)) - I1 = &*++BB1ItrPair.first; - for (auto &SuccIter : OtherSuccIterRange) { - Instruction *I2 = &*SuccIter; - while (isa(I2)) - I2 = &*++SuccIter; - } - } - bool AllInstsAreIdentical = true; bool HasTerminator = I1->isTerminator(); for (auto &SuccIter : OtherSuccIterRange) { @@ -1965,49 +1944,33 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI, if (AllInstsAreIdentical) { BB1ItrPair.first++; - if (isa(I1)) { - // The debug location is an integral part of a debug info intrinsic - // and can't be separated from it or replaced. Instead of attempting - // to merge locations, simply hoist both copies of the intrinsic. - hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts); - // We've just hoisted DbgVariableRecords; move I1 after them (before TI) - // and leave any that were not hoisted behind (by calling moveBefore - // rather than moveBeforePreserving). 
- I1->moveBefore(TI->getIterator()); - for (auto &SuccIter : OtherSuccIterRange) { - auto *I2 = &*SuccIter++; - assert(isa(I2)); - I2->moveBefore(TI->getIterator()); + // For a normal instruction, we just move one to right before the + // branch, then replace all uses of the other with the first. Finally, + // we remove the now redundant second instruction. + hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts); + // We've just hoisted DbgVariableRecords; move I1 after them (before TI) + // and leave any that were not hoisted behind (by calling moveBefore + // rather than moveBeforePreserving). + I1->moveBefore(TI->getIterator()); + for (auto &SuccIter : OtherSuccIterRange) { + Instruction *I2 = &*SuccIter++; + assert(I2 != I1); + if (!I2->use_empty()) + I2->replaceAllUsesWith(I1); + I1->andIRFlags(I2); + if (auto *CB = dyn_cast(I1)) { + bool Success = CB->tryIntersectAttributes(cast(I2)); + assert(Success && "We should not be trying to hoist callbases " + "with non-intersectable attributes"); + // For NDEBUG Compile. + (void)Success; } - } else { - // For a normal instruction, we just move one to right before the - // branch, then replace all uses of the other with the first. Finally, - // we remove the now redundant second instruction. - hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts); - // We've just hoisted DbgVariableRecords; move I1 after them (before TI) - // and leave any that were not hoisted behind (by calling moveBefore - // rather than moveBeforePreserving). - I1->moveBefore(TI->getIterator()); - for (auto &SuccIter : OtherSuccIterRange) { - Instruction *I2 = &*SuccIter++; - assert(I2 != I1); - if (!I2->use_empty()) - I2->replaceAllUsesWith(I1); - I1->andIRFlags(I2); - if (auto *CB = dyn_cast(I1)) { - bool Success = CB->tryIntersectAttributes(cast(I2)); - assert(Success && "We should not be trying to hoist callbases " - "with non-intersectable attributes"); - // For NDEBUG Compile. 
- (void)Success; - } - combineMetadataForCSE(I1, I2, true); - // I1 and I2 are being combined into a single instruction. Its debug - // location is the merged locations of the original instructions. - I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()); - I2->eraseFromParent(); - } + combineMetadataForCSE(I1, I2, true); + // I1 and I2 are being combined into a single instruction. Its debug + // location is the merged locations of the original instructions. + I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()); + I2->eraseFromParent(); } if (!Changed) NumHoistCommonCode += SuccIterPairs.size(); @@ -2297,11 +2260,8 @@ static void sinkLastInstruction(ArrayRef Blocks) { SmallVector Insts; for (auto *BB : Blocks) { Instruction *I = BB->getTerminator(); - do { - I = I->getPrevNode(); - } while (isa(I) && I != &BB->front()); - if (!isa(I)) - Insts.push_back(I); + I = I->getPrevNode(); + Insts.push_back(I); } // We don't need to do any more checking here; canSinkInstructions should @@ -3234,7 +3194,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, // - All of their uses are in ThenBB. SmallDenseMap SinkCandidateUseCounts; - SmallVector SpeculatedDbgIntrinsics; + SmallVector SpeculatedPseudoProbes; unsigned SpeculatedInstructions = 0; bool HoistLoadsStores = Options.HoistLoadsStoresWithCondFaulting; @@ -3243,12 +3203,6 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, StoreInst *SpeculatedStore = nullptr; EphemeralValueTracker EphTracker; for (Instruction &I : reverse(drop_end(*ThenBB))) { - // Skip debug info. - if (isa(I)) { - SpeculatedDbgIntrinsics.push_back(&I); - continue; - } - // Skip pseudo probes. The consequence is we lose track of the branch // probability for ThenBB, which is fine since the optimization here takes // place regardless of the branch probability. 
@@ -3257,7 +3211,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, // the samples collected on the non-conditional path are counted towards // the conditional path. We leave it for the counts inference algorithm to // figure out a proper count for an unknown probe. - SpeculatedDbgIntrinsics.push_back(&I); + SpeculatedPseudoProbes.push_back(&I); continue; } @@ -3388,9 +3342,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, // hoisting above. for (auto &I : make_early_inc_range(*ThenBB)) { if (!SpeculatedStoreValue || &I != SpeculatedStore) { - // Don't update the DILocation of dbg.assign intrinsics. - if (!isa(&I)) - I.setDebugLoc(DebugLoc::getDropped()); + I.setDebugLoc(DebugLoc::getDropped()); } I.dropUBImplyingAttrsAndMetadata(); @@ -3402,9 +3354,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, } // Hoist the instructions. - // In "RemoveDIs" non-instr debug-info mode, drop DbgVariableRecords attached - // to these instructions, in the same way that dbg.value intrinsics are - // dropped at the end of this block. + // Drop DbgVariableRecords attached to these instructions. for (auto &It : *ThenBB) for (DbgRecord &DR : make_early_inc_range(It.getDbgRecordRange())) // Drop all records except assign-kind DbgVariableRecords (dbg.assign @@ -3442,15 +3392,9 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, PN.setIncomingValue(ThenI, V); } - // Remove speculated dbg intrinsics. - // FIXME: Is it possible to do this in a more elegant way? Moving/merging the - // dbg value for the different flows and inserting it after the select. - for (Instruction *I : SpeculatedDbgIntrinsics) { - // We still want to know that an assignment took place so don't remove - // dbg.assign intrinsics. - if (!isa(I)) - I->eraseFromParent(); - } + // Remove speculated pseudo probes. 
+ for (Instruction *I : SpeculatedPseudoProbes) + I->eraseFromParent(); ++NumSpeculations; return true; @@ -4162,8 +4106,8 @@ bool llvm::foldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU, // Don't check the branch condition comparison itself. if (&I == Cond) continue; - // Ignore dbg intrinsics, and the terminator. - if (isa(I) || isa(I)) + // Ignore the terminator. + if (isa(I)) continue; // I must be safe to execute unconditionally. if (!isSafeToSpeculativelyExecute(&I)) @@ -7762,8 +7706,7 @@ static bool tryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, LandingPadInst *LPad2 = dyn_cast(I); if (!LPad2 || !LPad2->isIdenticalTo(LPad)) continue; - for (++I; isa(I); ++I) - ; + ++I; BranchInst *BI2 = dyn_cast(I); if (!BI2 || !BI2->isIdenticalTo(BI)) continue; @@ -7784,12 +7727,6 @@ static bool tryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, } } - // The debug info in OtherPred doesn't cover the merged control flow that - // used to go through BB. We need to delete it or update it. - for (Instruction &Inst : llvm::make_early_inc_range(*OtherPred)) - if (isa(Inst)) - Inst.eraseFromParent(); - SmallSetVector UniqueSuccs(succ_begin(BB), succ_end(BB)); for (BasicBlock *Succ : UniqueSuccs) { Succ->removePredecessor(BB); @@ -7837,8 +7774,7 @@ bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI, // constant, try to simplify the block. if (ICmpInst *ICI = dyn_cast(I)) if (ICI->isEquality() && isa(ICI->getOperand(1))) { - for (++I; isa(I); ++I) - ; + ++I; if (I->isTerminator() && tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder)) return true; @@ -7847,8 +7783,7 @@ bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI, // See if we can merge an empty landing pad block with another which is // equivalent. 
if (LandingPadInst *LPad = dyn_cast(I)) { - for (++I; isa(I); ++I) - ; + ++I; if (I->isTerminator() && tryToMergeLandingPad(LPad, BI, BB, DTU)) return true; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 8e09e6f8d4935..0c4e5bb3d4721 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -896,13 +896,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } // end of PHI handling // We handle calls that: - // * Are debug info intrinsics. // * Have a mapping to an IR intrinsic. // * Have a vector version available. auto *CI = dyn_cast(&I); if (CI && !getVectorIntrinsicIDForCall(CI, TLI) && - !isa(CI) && !(CI->getCalledFunction() && TLI && (!VFDatabase::getMappings(*CI).empty() || isTLIScalarize(*TLI, *CI)))) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index bd0a2ec3986d3..16d48b06dce41 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2764,8 +2764,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Fix widened non-induction PHIs by setting up the PHI operands. - if (EnableVPlanNativePath) - fixNonInductionPHIs(State); + fixNonInductionPHIs(State); // After vectorization, the exit blocks of the original loop will have // additional predecessors. Invalidate SCEVs for the exit phis in case SE @@ -6164,11 +6163,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, // First-order recurrences are replaced by vector shuffles inside the loop. if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { - // For , if vscale = 1 we are unable to extract the - // penultimate value of the recurrence. - // TODO: Consider vscale_range info. 
- if (VF.isScalable() && VF.getKnownMinValue() == 1) - return InstructionCost::getInvalid(); SmallVector Mask(VF.getKnownMinValue()); std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1); return TTI.getShuffleCost(TargetTransformInfo::SK_Splice, @@ -7324,7 +7318,6 @@ DenseMap LoopVectorizationPlanner::executePlan( "Trying to execute plan with unsupported VF"); assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); - VPlanTransforms::runPass(VPlanTransforms::materializeStepVectors, BestVPlan); // TODO: Move to VPlan transform stage once the transition to the VPlan-based // cost model is complete for better cost estimates. VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, @@ -8558,13 +8551,17 @@ addUsersInExitBlocks(VPlan &Plan, /// users in the original exit block using the VPIRInstruction wrapping to the /// LCSSA phi. static void addExitUsersForFirstOrderRecurrences( - VPlan &Plan, SetVector &ExitUsersToFix) { + VPlan &Plan, SetVector &ExitUsersToFix, VFRange &Range) { VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); auto *ScalarPHVPBB = Plan.getScalarPreheader(); auto *MiddleVPBB = Plan.getMiddleBlock(); VPBuilder ScalarPHBuilder(ScalarPHVPBB); VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); + auto IsScalableOne = [](ElementCount VF) -> bool { + return VF == ElementCount::getScalable(1); + }; + for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) { auto *FOR = dyn_cast(&HeaderPhi); if (!FOR) @@ -8646,6 +8643,15 @@ static void addExitUsersForFirstOrderRecurrences( for (VPIRInstruction *ExitIRI : ExitUsersToFix) { if (ExitIRI->getOperand(0) != FOR) continue; + // For VF vscale x 1, if vscale = 1, we are unable to extract the + // penultimate value of the recurrence. 
Instead, we rely on function + // addUsersInExitBlocks to extract the last element from the result of + // VPInstruction::FirstOrderRecurrenceSplice by leaving the user of the + // recurrence phi in ExitUsersToFix. + // TODO: Consider vscale_range info and UF. + if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne, + Range)) + return; VPValue *PenultimateElement = MiddleBuilder.createNaryOp( VPInstruction::ExtractPenultimateElement, {FOR->getBackedgeValue()}, {}, "vector.recur.extract.for.phi"); @@ -8860,7 +8866,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues); SetVector ExitUsersToFix = collectUsersInLatchExitBlock(*Plan); - addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix); + addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix, Range); addUsersInExitBlocks(*Plan, ExitUsersToFix); // --------------------------------------------------------------------------- @@ -9765,7 +9771,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, match( P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck), m_SpecificInt(0)) && - is_contained(P.incoming_values(), EPI.VectorTripCount)) + all_of(P.incoming_values(), [&EPI](Value *Inc) { + return Inc == EPI.VectorTripCount || + match(Inc, m_SpecificInt(0)); + })) return &P; return nullptr; }); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c3ca22dce0cc4..4551a365a6967 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -511,15 +511,25 @@ static bool isSplat(ArrayRef VL) { } /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. -static bool isCommutative(Instruction *I) { +/// For BinaryOperator, it also checks if \p InstWithUses is used in specific +/// patterns that make it effectively commutative (like equality comparisons +/// with zero). 
+/// In most cases, users should not call this function directly (since \p I and +/// \p InstWithUses are the same). However, when analyzing interchangeable +/// instructions, we need to use the converted opcode along with the original +/// uses. +/// \param I The instruction to check for commutativity +/// \param InstWithUses The instruction whose uses are analyzed for special +/// patterns +static bool isCommutative(Instruction *I, Instruction *InstWithUses) { if (auto *Cmp = dyn_cast(I)) return Cmp->isCommutative(); if (auto *BO = dyn_cast(I)) return BO->isCommutative() || (BO->getOpcode() == Instruction::Sub && - !BO->hasNUsesOrMore(UsesLimit) && + !InstWithUses->hasNUsesOrMore(UsesLimit) && all_of( - BO->uses(), + InstWithUses->uses(), [](const Use &U) { // Commutative, if icmp eq/ne sub, 0 CmpPredicate Pred; @@ -536,14 +546,24 @@ static bool isCommutative(Instruction *I) { Flag->isOne()); })) || (BO->getOpcode() == Instruction::FSub && - !BO->hasNUsesOrMore(UsesLimit) && - all_of(BO->uses(), [](const Use &U) { + !InstWithUses->hasNUsesOrMore(UsesLimit) && + all_of(InstWithUses->uses(), [](const Use &U) { return match(U.getUser(), m_Intrinsic(m_Specific(U.get()))); })); return I->isCommutative(); } +/// This is a helper function to check whether \p I is commutative. +/// This is a convenience wrapper that calls the two-parameter version of +/// isCommutative with the same instruction for both parameters. This is +/// the common case where the instruction being checked for commutativity +/// is the same as the instruction whose uses are analyzed for special +/// patterns (see the two-parameter version above for details). 
+/// \param I The instruction to check for commutativity +/// \returns true if the instruction is commutative, false otherwise +static bool isCommutative(Instruction *I) { return isCommutative(I, I); } + template static std::optional getInsertExtractIndex(const Value *Inst, unsigned Offset) { @@ -2898,7 +2918,11 @@ class BoUpSLP { continue; } auto [SelectedOp, Ops] = convertTo(cast(V), S); - bool IsInverseOperation = !isCommutative(SelectedOp); + // We cannot check commutativity by the converted instruction + // (SelectedOp) because isCommutative also examines def-use + // relationships. + bool IsInverseOperation = + !isCommutative(SelectedOp, cast(V)); for (unsigned OpIdx : seq(ArgSize)) { bool APO = (OpIdx == 0) ? false : IsInverseOperation; OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false}; @@ -12061,7 +12085,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // FIXME: this must be moved to TTI for better estimation. unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts); auto CheckPerRegistersShuffle = [&](MutableArrayRef Mask, - SmallVectorImpl &Indices) + SmallVectorImpl &Indices, + SmallVectorImpl &SubVecSizes) -> std::optional { if (NumElts <= EltsPerVector) return std::nullopt; @@ -12106,7 +12131,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { return std::min(S, I); }), EltsPerVector); - Indices.push_back(OffsetReg1 % NumElts); + unsigned Index = OffsetReg1 % NumElts; + Indices.push_back(Index); + SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector)); } Idx = I - OffsetReg1; } @@ -12128,8 +12155,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { SmallVector SubMask(EltsPerVector, PoisonMaskElem); copy(MaskSlice, SubMask.begin()); SmallVector Indices; + SmallVector SubVecSizes; std::optional RegShuffleKind = - CheckPerRegistersShuffle(SubMask, Indices); + CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes); if (!RegShuffleKind) { if (*ShuffleKinds[Part] != 
TTI::SK_PermuteSingleSrc || !ShuffleVectorInst::isIdentityMask( @@ -12147,12 +12175,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } const unsigned BaseVF = getFullVectorNumberOfElements( *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector)); - for (unsigned Idx : Indices) { - assert((Idx + EltsPerVector) <= BaseVF && + for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) { + assert((Idx + SubVecSize) <= BaseVF && "SK_ExtractSubvector index out of range"); Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, getWidenedType(ScalarTy, BaseVF), {}, CostKind, - Idx, getWidenedType(ScalarTy, EltsPerVector)); + Idx, getWidenedType(ScalarTy, SubVecSize)); } // Second attempt to check, if just a permute is better estimated than // subvector extract. @@ -14886,25 +14914,47 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, Cost += ExtractCost; auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef Mask, - bool) { + bool ForSingleMask) { InstructionCost C = 0; unsigned VF = Mask.size(); unsigned VecVF = TE->getVectorFactor(); - if (VF != VecVF && - (any_of(Mask, [VF](int Idx) { return Idx >= static_cast(VF); }) || - !ShuffleVectorInst::isIdentityMask(Mask, VF))) { - SmallVector OrigMask(VecVF, PoisonMaskElem); - std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), - OrigMask.begin()); - C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - getWidenedType(TE->getMainOp()->getType(), VecVF), - OrigMask); - LLVM_DEBUG( - dbgs() << "SLP: Adding cost " << C - << " for final shuffle of insertelement external users.\n"; - TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - return std::make_pair(TE, true); + bool HasLargeIndex = + any_of(Mask, [VF](int Idx) { return Idx >= static_cast(VF); }); + if ((VF != VecVF && HasLargeIndex) || + !ShuffleVectorInst::isIdentityMask(Mask, VF)) { + + if (HasLargeIndex) { + SmallVector OrigMask(VecVF, PoisonMaskElem); + 
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), + OrigMask.begin()); + C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, + getWidenedType(TE->getMainOp()->getType(), VecVF), + OrigMask); + LLVM_DEBUG( + dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users.\n"; + TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + return std::make_pair(TE, true); + } + + if (!ForSingleMask) { + SmallVector ResizeMask(VF, PoisonMaskElem); + for (unsigned I = 0; I < VF; ++I) { + if (Mask[I] != PoisonMaskElem) + ResizeMask[Mask[I]] = Mask[I]; + } + if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF)) + C = ::getShuffleCost( + *TTI, TTI::SK_PermuteSingleSrc, + getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask); + LLVM_DEBUG( + dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users.\n"; + TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); + + Cost += C; + } } return std::make_pair(TE, false); }; @@ -21191,25 +21241,30 @@ bool SLPVectorizerPass::vectorizeStores( ++Repeat; bool RepeatChanged = false; bool AnyProfitableGraph = false; - for (unsigned Size : CandidateVFs) { + for (unsigned VF : CandidateVFs) { AnyProfitableGraph = false; - unsigned StartIdx = std::distance( - RangeSizes.begin(), - find_if(RangeSizes, - std::bind(IsNotVectorized, Size >= MaxRegVF, _1))); - while (StartIdx < End) { - unsigned EndIdx = std::distance( + unsigned FirstUnvecStore = + std::distance(RangeSizes.begin(), + find_if(RangeSizes, std::bind(IsNotVectorized, + VF >= MaxRegVF, _1))); + + // Form slices of size VF starting from FirstUnvecStore and try to + // vectorize them. + while (FirstUnvecStore < End) { + unsigned FirstVecStore = std::distance( RangeSizes.begin(), - find_if(RangeSizes.drop_front(StartIdx), - std::bind(IsVectorized, Size >= MaxRegVF, _1))); - unsigned Sz = EndIdx >= End ? 
End : EndIdx; - for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) { - if (!checkTreeSizes(RangeSizes.slice(Cnt, Size), - Size >= MaxRegVF)) { - ++Cnt; + find_if(RangeSizes.drop_front(FirstUnvecStore), + std::bind(IsVectorized, VF >= MaxRegVF, _1))); + unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore; + for (unsigned SliceStartIdx = FirstUnvecStore; + SliceStartIdx + VF <= MaxSliceEnd;) { + if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF), + VF >= MaxRegVF)) { + ++SliceStartIdx; continue; } - ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); + ArrayRef Slice = + ArrayRef(Operands).slice(SliceStartIdx, VF); assert(all_of(Slice, [&](Value *V) { return cast(V) @@ -21223,19 +21278,23 @@ bool SLPVectorizerPass::vectorizeStores( if (!NonSchedulable.empty()) { auto [NonSchedSizeMax, NonSchedSizeMin] = NonSchedulable.lookup(Slice.front()); - if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) { - Cnt += NonSchedSizeMax; + if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) { + // VF is too ambitious. Try to vectorize another slice before + // trying a smaller VF. + SliceStartIdx += NonSchedSizeMax; continue; } } unsigned TreeSize; std::optional Res = - vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize); + vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize); if (!Res) { + // Update the range of non schedulable VFs for slices starting + // at SliceStartIdx. NonSchedulable - .try_emplace(Slice.front(), std::make_pair(Size, Size)) + .try_emplace(Slice.front(), std::make_pair(VF, VF)) .first->getSecond() - .second = Size; + .second = VF; } else if (*Res) { // Mark the vectorized stores so that we don't vectorize them // again. @@ -21246,63 +21305,67 @@ bool SLPVectorizerPass::vectorizeStores( // If we vectorized initial block, no need to try to vectorize // it again. 
for (std::pair &P : - RangeSizes.slice(Cnt, Size)) + RangeSizes.slice(SliceStartIdx, VF)) P.first = P.second = 0; - if (Cnt < StartIdx + MinVF) { - for (std::pair &P : - RangeSizes.slice(StartIdx, Cnt - StartIdx)) + if (SliceStartIdx < FirstUnvecStore + MinVF) { + for (std::pair &P : RangeSizes.slice( + FirstUnvecStore, SliceStartIdx - FirstUnvecStore)) P.first = P.second = 0; - StartIdx = Cnt + Size; + FirstUnvecStore = SliceStartIdx + VF; } - if (Cnt > Sz - Size - MinVF) { + if (SliceStartIdx > MaxSliceEnd - VF - MinVF) { for (std::pair &P : - RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size))) + RangeSizes.slice(SliceStartIdx + VF, + MaxSliceEnd - (SliceStartIdx + VF))) P.first = P.second = 0; - if (Sz == End) - End = Cnt; - Sz = Cnt; + if (MaxSliceEnd == End) + End = SliceStartIdx; + MaxSliceEnd = SliceStartIdx; } - Cnt += Size; + SliceStartIdx += VF; continue; } - if (Size > 2 && Res && - !all_of(RangeSizes.slice(Cnt, Size), - std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize, + if (VF > 2 && Res && + !all_of(RangeSizes.slice(SliceStartIdx, VF), + std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize, _1))) { - Cnt += Size; + SliceStartIdx += VF; continue; } // Check for the very big VFs that we're not rebuilding same // trees, just with larger number of elements. 
- if (Size > MaxRegVF && TreeSize > 1 && - all_of(RangeSizes.slice(Cnt, Size), + if (VF > MaxRegVF && TreeSize > 1 && + all_of(RangeSizes.slice(SliceStartIdx, VF), std::bind(FirstSizeSame, TreeSize, _1))) { - Cnt += Size; - while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize) - ++Cnt; + SliceStartIdx += VF; + while (SliceStartIdx != MaxSliceEnd && + RangeSizes[SliceStartIdx].first == TreeSize) + ++SliceStartIdx; continue; } - if (TreeSize > 1) + if (TreeSize > 1) { for (std::pair &P : - RangeSizes.slice(Cnt, Size)) { - if (Size >= MaxRegVF) + RangeSizes.slice(SliceStartIdx, VF)) { + if (VF >= MaxRegVF) P.second = std::max(P.second, TreeSize); else P.first = std::max(P.first, TreeSize); } - ++Cnt; + } + ++SliceStartIdx; AnyProfitableGraph = true; } - if (StartIdx >= End) + if (FirstUnvecStore >= End) break; - if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF) + if (MaxSliceEnd - FirstUnvecStore < VF && + MaxSliceEnd - FirstUnvecStore >= MinVF) AnyProfitableGraph = true; - StartIdx = std::distance( + FirstUnvecStore = std::distance( RangeSizes.begin(), - find_if(RangeSizes.drop_front(Sz), - std::bind(IsNotVectorized, Size >= MaxRegVF, _1))); + find_if(RangeSizes.drop_front(MaxSliceEnd), + std::bind(IsNotVectorized, VF >= MaxRegVF, _1))); } - if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size)) + if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF)) break; } // All values vectorized - exit. @@ -24297,9 +24360,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; } - if (isa(It)) - continue; - // Try to vectorize reductions that use PHINodes. if (PHINode *P = dyn_cast(It)) { // Check that the PHI is a reduction PHI. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index cca3d32c0783e..4332332ef5cc3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1029,17 +1029,11 @@ void VPlan::execute(VPTransformState *State) { if (isa(&R)) continue; - if (isa(&R)) { - PHINode *Phi = nullptr; - if (isa(&R)) { - Phi = cast(State->get(R.getVPSingleValue())); - } else { - auto *WidenPhi = cast(&R); - assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && - "recipe generating only scalars should have been replaced"); - auto *GEP = cast(State->get(WidenPhi)); - Phi = cast(GEP->getPointerOperand()); - } + if (auto *WidenPhi = dyn_cast(&R)) { + assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) && + "recipe generating only scalars should have been replaced"); + auto *GEP = cast(State->get(WidenPhi)); + PHINode *Phi = cast(GEP->getPointerOperand()); Phi->setIncomingBlock(1, VectorLatchBB); @@ -1047,10 +1041,6 @@ void VPlan::execute(VPTransformState *State) { // consistent placement of all induction updates. Instruction *Inc = cast(Phi->getIncomingValue(1)); Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator())); - - // Use the steps for the last part as backedge value for the induction. - if (auto *IV = dyn_cast(&R)) - Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand())); continue; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 5a3c4a514a5dd..f3306ad7cb8ec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1951,12 +1951,13 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe { }; /// A recipe for handling phi nodes of integer and floating-point inductions, -/// producing their vector values. +/// producing their vector values. This is an abstract recipe and must be +/// converted to concrete recipes before executing. 
class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { TruncInst *Trunc; // If this recipe is unrolled it will have 2 additional operands. - bool isUnrolled() const { return getNumOperands() == 6; } + bool isUnrolled() const { return getNumOperands() == 5; } public: VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, @@ -1992,9 +1993,10 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC) - /// Generate the vectorized and scalarized versions of the phi node as - /// needed by their users. - void execute(VPTransformState &State) override; + void execute(VPTransformState &State) override { + llvm_unreachable("cannot execute this recipe, should be expanded via " + "expandVPWidenIntOrFpInductionRecipe"); + } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. @@ -2005,16 +2007,6 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { VPValue *getVFValue() { return getOperand(2); } const VPValue *getVFValue() const { return getOperand(2); } - // TODO: Remove once VPWidenIntOrFpInduction is fully expanded in - // convertToConcreteRecipes. - VPInstructionWithType *getStepVector() { - auto *StepVector = - cast(getOperand(3)->getDefiningRecipe()); - assert(StepVector->getOpcode() == VPInstruction::StepVector && - "step vector operand must be a VPInstruction::StepVector"); - return StepVector; - } - VPValue *getSplatVFValue() { // If the recipe has been unrolled return the VPValue for the induction // increment. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 048286d7a97bc..f3b5c8cfa9885 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -952,6 +952,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::BranchOnCount: case VPInstruction::BranchOnCond: + case VPInstruction::Broadcast: case VPInstruction::ReductionStartVector: return true; case VPInstruction::PtrAdd: @@ -1077,15 +1078,14 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, void VPInstructionWithType::execute(VPTransformState &State) { State.setDebugLocFrom(getDebugLoc()); - switch (getOpcode()) { - case Instruction::ZExt: - case Instruction::Trunc: { + if (isScalarCast()) { Value *Op = State.get(getOperand(0), VPLane(0)); Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()), Op, ResultTy); State.set(this, Cast, VPLane(0)); - break; + return; } + switch (getOpcode()) { case VPInstruction::StepVector: { Value *StepVector = State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF)); @@ -1965,44 +1965,6 @@ InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF, return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); } -/// This function adds -/// (0 * Step, 1 * Step, 2 * Step, ...) -/// to each vector element of Val. -/// \p Opcode is relevant for FP induction variable. -/// \p InitVec is an integer step vector from 0 with a step of 1. -static Value *getStepVector(Value *Val, Value *Step, Value *InitVec, - Instruction::BinaryOps BinOp, ElementCount VF, - IRBuilderBase &Builder) { - assert(VF.isVector() && "only vector VFs are supported"); - - // Create and check the types. 
- auto *ValVTy = cast(Val->getType()); - ElementCount VLen = ValVTy->getElementCount(); - - Type *STy = Val->getType()->getScalarType(); - assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && - "Induction Step must be an integer or FP"); - assert(Step->getType() == STy && "Step has wrong type"); - - if (STy->isIntegerTy()) { - Step = Builder.CreateVectorSplat(VLen, Step); - assert(Step->getType() == Val->getType() && "Invalid step vec"); - // FIXME: The newly created binary instructions should contain nsw/nuw - // flags, which can be found from the original scalar operations. - Step = Builder.CreateMul(InitVec, Step); - return Builder.CreateAdd(Val, Step, "induction"); - } - - // Floating point induction. - assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && - "Binary Opcode should be specified for FP induction"); - InitVec = Builder.CreateUIToFP(InitVec, ValVTy); - - Step = Builder.CreateVectorSplat(VLen, Step); - Value *MulOp = Builder.CreateFMul(InitVec, Step); - return Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); -} - /// A helper function that returns an integer or floating-point constant with /// value C. static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { @@ -2010,104 +1972,6 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { : ConstantFP::get(Ty, C); } -void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { - assert(!State.Lane && "Int or FP induction being replicated."); - - Value *Start = getStartValue()->getLiveInIRValue(); - const InductionDescriptor &ID = getInductionDescriptor(); - TruncInst *Trunc = getTruncInst(); - IRBuilderBase &Builder = State.Builder; - assert(getPHINode()->getType() == ID.getStartValue()->getType() && - "Types must match"); - assert(State.VF.isVector() && "must have vector VF"); - - // The value from the original loop to which we are mapping the new induction - // variable. - Instruction *EntryVal = Trunc ? 
cast(Trunc) : getPHINode(); - - // Fast-math-flags propagate from the original induction instruction. - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - if (isa_and_present(ID.getInductionBinOp())) - Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); - - // Now do the actual transformations, and start with fetching the step value. - Value *Step = State.get(getStepValue(), VPLane(0)); - - assert((isa(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // Construct the initial value of the vector IV in the vector loop preheader - auto CurrIP = Builder.saveIP(); - BasicBlock *VectorPH = - State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); - Builder.SetInsertPoint(VectorPH->getTerminator()); - if (isa(EntryVal)) { - assert(Start->getType()->isIntegerTy() && - "Truncation requires an integer type"); - auto *TruncType = cast(EntryVal->getType()); - Step = Builder.CreateTrunc(Step, TruncType); - Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); - } - - Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); - Value *SteppedStart = - ::getStepVector(SplatStart, Step, State.get(getStepVector()), - ID.getInductionOpcode(), State.VF, State.Builder); - - // We create vector phi nodes for both integer and floating-point induction - // variables. Here, we determine the kind of arithmetic we will perform. - Instruction::BinaryOps AddOp; - Instruction::BinaryOps MulOp; - if (Step->getType()->isIntegerTy()) { - AddOp = Instruction::Add; - MulOp = Instruction::Mul; - } else { - AddOp = ID.getInductionOpcode(); - MulOp = Instruction::FMul; - } - - Value *SplatVF; - if (VPValue *SplatVFOperand = getSplatVFValue()) { - // The recipe has been unrolled. In that case, fetch the splat value for the - // induction increment. - SplatVF = State.get(SplatVFOperand); - } else { - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. 
- Type *StepType = Step->getType(); - Value *RuntimeVF = State.get(getVFValue(), VPLane(0)); - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType); - else - RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - - // Create a vector splat to use in the induction update. - SplatVF = Builder.CreateVectorSplat(State.VF, Mul); - } - - Builder.restoreIP(CurrIP); - - // We may need to add the step a number of times, depending on the unroll - // factor. The last of those goes into the PHI. - PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind"); - VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt()); - VecInd->setDebugLoc(getDebugLoc()); - State.set(this, VecInd); - - Instruction *LastInduction = cast( - Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next")); - LastInduction->setDebugLoc(getDebugLoc()); - - VecInd->addIncoming(SteppedStart, VectorPH); - // Add induction update using an incorrect block temporarily. The phi node - // will be fixed after VPlan execution. Note that at this point the latch - // block cannot be used, as it does not exist yet. - // TODO: Model increment value in VPlan, by turning the recipe into a - // multi-def and a subclass of VPHeaderPHIRecipe. 
- VecInd->addIncoming(LastInduction, VectorPH); -} - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { @@ -3816,9 +3680,6 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF, if (VF.isScalar()) return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); - if (VF == ElementCount::getScalable(1)) - return InstructionCost::getInvalid(); - return 0; } @@ -3871,12 +3732,14 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenPHIRecipe::execute(VPTransformState &State) { - assert(EnableVPlanNativePath && - "Non-native vplans are not expected to have VPWidenPHIRecipes."); - Value *Op0 = State.get(getOperand(0)); Type *VecTy = Op0->getType(); - Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name); + Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name); + // Manually move it with the other PHIs in case PHI recipes above this one + // also inserted non-phi instructions. + // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in + // convertToConcreteRecipes. + VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt()); State.set(this, VecPhi); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 44a72755b9cf8..11f0f2a930329 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1141,6 +1141,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } + if (auto *Phi = dyn_cast(Def)) { + if (Phi->getOperand(0) == Phi->getOperand(1)) + Def->replaceAllUsesWith(Phi->getOperand(0)); + return; + } + // Some simplifications can only be applied after unrolling. Perform them // below. 
if (!Plan->isUnrolled()) @@ -1352,17 +1358,6 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, WideIV->setStartValue(NewStart); auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1)); WideIV->setStepValue(NewStep); - // TODO: Remove once VPWidenIntOrFpInductionRecipe is fully expanded. - VPInstructionWithType *OldStepVector = WideIV->getStepVector(); - assert(OldStepVector->getNumUsers() == 1 && - "step vector should only be used by single " - "VPWidenIntOrFpInductionRecipe"); - auto *NewStepVector = - new VPInstructionWithType(VPInstruction::StepVector, {}, NewIVTy, {}, - OldStepVector->getDebugLoc()); - NewStepVector->insertAfter(OldStepVector->getDefiningRecipe()); - OldStepVector->replaceAllUsesWith(NewStepVector); - OldStepVector->eraseFromParent(); auto *NewBTC = new VPWidenCastRecipe( Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy); @@ -2512,6 +2507,127 @@ void VPlanTransforms::createInterleaveGroups( } } +/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial +/// value, phi and backedge value. In the following example: +/// +/// vector.ph: +/// Successor(s): vector loop +/// +/// vector loop: { +/// vector.body: +/// WIDEN-INDUCTION %i = phi %start, %step, %vf +/// ... +/// EMIT branch-on-count ... +/// No successors +/// } +/// +/// WIDEN-INDUCTION will get expanded to: +/// +/// vector.ph: +/// ... +/// vp<%induction.start> = ... +/// vp<%induction.increment> = ... +/// +/// Successor(s): vector loop +/// +/// vector loop: { +/// vector.body: +/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next> +/// ... +/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment> +/// EMIT branch-on-count ... 
+/// No successors +/// } +static void +expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, + VPTypeAnalysis &TypeInfo) { + VPlan *Plan = WidenIVR->getParent()->getPlan(); + VPValue *Start = WidenIVR->getStartValue(); + VPValue *Step = WidenIVR->getStepValue(); + VPValue *VF = WidenIVR->getVFValue(); + DebugLoc DL = WidenIVR->getDebugLoc(); + + // The value from the original loop to which we are mapping the new induction + // variable. + Type *Ty = TypeInfo.inferScalarType(WidenIVR); + + const InductionDescriptor &ID = WidenIVR->getInductionDescriptor(); + Instruction::BinaryOps AddOp; + Instruction::BinaryOps MulOp; + // FIXME: The newly created binary instructions should contain nsw/nuw + // flags, which can be found from the original scalar operations. + VPIRFlags Flags; + if (ID.getKind() == InductionDescriptor::IK_IntInduction) { + AddOp = Instruction::Add; + MulOp = Instruction::Mul; + } else { + AddOp = ID.getInductionOpcode(); + MulOp = Instruction::FMul; + Flags = ID.getInductionBinOp()->getFastMathFlags(); + } + + // If the phi is truncated, truncate the start and step values. + VPBuilder Builder(Plan->getVectorPreheader()); + Type *StepTy = TypeInfo.inferScalarType(Step); + if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) { + assert(StepTy->isIntegerTy() && "Truncation requires an integer type"); + Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL); + Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL); + StepTy = Ty; + } + + // Construct the initial value of the vector IV in the vector loop preheader. 
+ Type *IVIntTy = + IntegerType::get(StepTy->getContext(), StepTy->getScalarSizeInBits()); + VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy); + if (StepTy->isFloatingPointTy()) + Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy); + + VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start); + VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step); + + Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags); + Init = + Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction"); + + // Create the widened phi of the vector IV. + auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr, + WidenIVR->getDebugLoc(), "vec.ind"); + WidePHI->addOperand(Init); + WidePHI->insertBefore(WidenIVR); + + // Create the backedge value for the vector IV. + VPValue *Inc; + VPValue *Prev; + // If unrolled, use the increment and prev value from the operands. + if (auto *SplatVF = WidenIVR->getSplatVFValue()) { + Inc = SplatVF; + Prev = WidenIVR->getLastUnrolledPartOperand(); + } else { + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + if (StepTy->isFloatingPointTy()) + VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy, + DL); + else + VF = + Builder.createScalarCast(Instruction::CastOps::Trunc, VF, StepTy, DL); + + Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags); + Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc); + Prev = WidePHI; + } + + VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock(); + Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator()); + auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags, + WidenIVR->getDebugLoc(), "vec.ind.next"); + + WidePHI->addOperand(Next); + + WidenIVR->replaceAllUsesWith(WidePHI); +} + void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { // Replace loop regions with explicity CFG. 
SmallVector LoopRegions; @@ -2619,6 +2735,12 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, continue; } + if (auto *WidenIVR = dyn_cast(&R)) { + expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo); + ToRemove.push_back(WidenIVR); + continue; + } + VPValue *VectorStep; VPValue *ScalarStep; if (!match(&R, m_VPInstruction( @@ -2929,27 +3051,6 @@ void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, } } -void VPlanTransforms::materializeStepVectors(VPlan &Plan) { - for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - auto *IVR = dyn_cast(&Phi); - if (!IVR) - continue; - - Type *Ty = IVR->getPHINode()->getType(); - if (TruncInst *Trunc = IVR->getTruncInst()) - Ty = Trunc->getType(); - if (Ty->isFloatingPointTy()) - Ty = IntegerType::get(Ty->getContext(), Ty->getScalarSizeInBits()); - - VPBuilder Builder(Plan.getVectorPreheader()); - VPInstruction *StepVector = Builder.createNaryOp( - VPInstruction::StepVector, {}, Ty, {}, IVR->getDebugLoc()); - assert(IVR->getNumOperands() == 3 && - "can only add step vector before unrolling"); - IVR->addOperand(StepVector); - } -} - void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { if (Plan.hasScalarVFOnly()) return; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 5a03bdb7c6882..7e51c05d1b5b5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -209,11 +209,6 @@ struct VPlanTransforms { optimizeInductionExitUsers(VPlan &Plan, DenseMap &EndValues); - /// Materialize VPInstruction::StepVectors for VPWidenIntOrFpInductionRecipes. - /// TODO: Remove once all of VPWidenIntOrFpInductionRecipe is expanded in - /// convertToConcreteRecipes. - static void materializeStepVectors(VPlan &Plan); - /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors. 
static void materializeBroadcasts(VPlan &Plan); diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll index aba113865af10..16b3913f52028 100644 --- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll @@ -53,6 +53,14 @@ define void @andcmp() { ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32 = icmp eq i32 %a32, 0 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %a64 = and i64 undef, undef ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64 = icmp ne i64 %a64, 0 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32ge = icmp sge i32 %a32, 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32le = icmp slt i32 %a32, 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32leneg = icmp sle i32 %a32, -1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32gtneg = icmp sgt i32 %a32, -1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64ge = icmp sge i64 %a64, 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64le = icmp slt i64 %a64, 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64leneg = icmp sle i64 %a64, -1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64gtneg = icmp sgt i64 %a64, -1 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a128 = and i128 undef, undef ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %c128 = icmp eq i128 %a128, 0 ; CHECK-NEXT: Cost Model: Found costs of 1 for: %av16i8 = and <16 x i8> undef, undef @@ -62,7 +70,7 @@ define void @andcmp() { ; CHECK-NEXT: Cost Model: Found costs of 1 for: %av4i32 = and <4 x i32> undef, undef ; CHECK-NEXT: Cost Model: 
Found costs of 1 for: %cv4i32 = icmp ne <4 x i32> %av4i32, zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 1 for: %c32not0 = icmp eq i32 %a32, 1 -; CHECK-NEXT: Cost Model: Found costs of 1 for: %c64sle = icmp sle i64 %a64, 0 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64sle = icmp sle i64 %a64, 0 ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %a8 = and i8 undef, undef @@ -73,6 +81,17 @@ define void @andcmp() { %c32 = icmp eq i32 %a32, 0 %a64 = and i64 undef, undef %c64 = icmp ne i64 %a64, 0 + + %c32ge = icmp sge i32 %a32, 1 + %c32le = icmp slt i32 %a32, 1 + %c32leneg = icmp sle i32 %a32, -1 + %c32gtneg = icmp sgt i32 %a32, -1 + + %c64ge = icmp sge i64 %a64, 1 + %c64le = icmp slt i64 %a64, 1 + %c64leneg = icmp sle i64 %a64, -1 + %c64gtneg = icmp sgt i64 %a64, -1 + %a128 = and i128 undef, undef %c128 = icmp eq i128 %a128, zeroinitializer %av16i8 = and <16 x i8> undef, undef diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll index e162edbf611e2..7ac4db3119210 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll @@ -22,12 +22,12 @@ define void @canonicalize_f16() { ; ; GFX8-LABEL: 'canonicalize_f16' ; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> 
@llvm.canonicalize.v5f16(<5 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) ; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-LABEL: 'canonicalize_f16' @@ -62,12 +62,12 @@ define void @canonicalize_f16() { ; ; GFX8-SIZE-LABEL: 'canonicalize_f16' ; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef) ; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; GFX9-SIZE-LABEL: 'canonicalize_f16' diff --git a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll index 06a058ff2e7b1..5b042a8a04603 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll @@ -23,13 +23,13 @@ define void @copysign_f16() { ; ; GFX8-LABEL: 'copysign_f16' ; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> 
@llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) -; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) +; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call 
<16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) ; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-LABEL: 'copysign_f16' @@ -67,13 +67,13 @@ define void @copysign_f16() { ; ; GFX8-SIZE-LABEL: 'copysign_f16' ; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) -; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> 
undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef) ; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; GFX9-SIZE-LABEL: 'copysign_f16' @@ -245,25 +245,25 @@ define void @copysign_bf16() { define void @copysign_f64() { ; ALL-LABEL: 'copysign_f64' -; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) -; ALL-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef) -; ALL-NEXT: Cost 
Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef) +; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; ALL-SIZE-LABEL: 'copysign_f64' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef) -; 
ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef) +; ALL-SIZE-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f64 = call double @llvm.copysign.f64(double undef, double undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll new file mode 100644 index 0000000000000..a81cb63f0c51f --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll @@ -0,0 +1,506 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=ALL,GFX7 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX9 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=ALL,GFX12 %s + +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=SIZE,GFX7-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=SIZE,GFX8-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=SIZE,GFX12-SIZE %s + +define void @maximumnum_f16() { +; GFX7-LABEL: 'maximumnum_f16' +; GFX7-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'maximumnum_f16' +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated 
cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'maximumnum_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX12-LABEL: 'maximumnum_f16' +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> 
@llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'maximumnum_f16' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'maximumnum_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = 
call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'maximumnum_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX12-SIZE-LABEL: 'maximumnum_f16' +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.maximumnum.f16(half poison, half poison) + %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) + %v3f16 = call <3x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) + %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) + %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) + %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) + ret void +} + +define void @maximumnum_bf16() { +; ALL-LABEL: 'maximumnum_bf16' +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: 
%v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'maximumnum_bf16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) + %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) + %v3bf16 = call <3x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) + %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) + %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) + %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) + ret void +} + +define void @maximumnum_f32() { +; ALL-LABEL: 'maximumnum_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f32 = 
call float @llvm.maximumnum.f32(float poison, float poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'maximumnum_f32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret void +; + %f32 = call float @llvm.maximumnum.f32(float poison, float poison) + %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) + %v3f32 = call <3x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) + %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) + %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) + %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) + ret void +} + +define void @maximumnum_f64() { +; ALL-LABEL: 'maximumnum_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'maximumnum_f64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> 
poison, <2 x double> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.maximumnum.f64(double poison, double poison) + %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison) + %v3f64 = call <3x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) + %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) + %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) + %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) + ret void +} + +define void @maximumnum_f16_no_ieee() #0 { +; GFX7-LABEL: 'maximumnum_f16_no_ieee' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> 
@llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'maximumnum_f16_no_ieee' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'maximumnum_f16_no_ieee' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, 
<3 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX12-LABEL: 'maximumnum_f16_no_ieee' +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'maximumnum_f16_no_ieee' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) 
+; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'maximumnum_f16_no_ieee' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'maximumnum_f16_no_ieee' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half 
@llvm.maximumnum.f16(half poison, half poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX12-SIZE-LABEL: 'maximumnum_f16_no_ieee' +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX12-SIZE-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.maximumnum.f16(half poison, half poison) + %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison) + %v3f16 = call <3x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison) + %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison) + %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison) + %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison) + ret void +} + +define void @maximumnum_bf16_no_ieee() #0 { +; GFX7-LABEL: 'maximumnum_bf16_no_ieee' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'maximumnum_bf16_no_ieee' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'maximumnum_bf16_no_ieee' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX12-LABEL: 'maximumnum_bf16_no_ieee' +; 
GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'maximumnum_bf16_no_ieee' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'maximumnum_bf16_no_ieee' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'maximumnum_bf16_no_ieee' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call 
<4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX12-SIZE-LABEL: 'maximumnum_bf16_no_ieee' +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison) + %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) + %v3bf16 = call <3x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) + %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x 
bfloat> poison, <4 x bfloat> poison) + %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) + %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) + ret void +} + +define void @maximumnum_f32_no_ieee() #0 { +; GFX7-LABEL: 'maximumnum_f32_no_ieee' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'maximumnum_f32_no_ieee' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x 
float> poison, <4 x float> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'maximumnum_f32_no_ieee' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX12-LABEL: 'maximumnum_f32_no_ieee' +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> 
poison, <3 x float> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'maximumnum_f32_no_ieee' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'maximumnum_f32_no_ieee' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = 
call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'maximumnum_f32_no_ieee' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX12-SIZE-LABEL: 'maximumnum_f32_no_ieee' 
+; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.maximumnum.f32(float poison, float poison) + %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison) + %v3f32 = call <3x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison) + %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison) + %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison) + %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison) + ret void +} + +define void @maximumnum_f64_no_ieee() #0 { +; GFX7-LABEL: 'maximumnum_f64_no_ieee' +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x 
double> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'maximumnum_f64_no_ieee' +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'maximumnum_f64_no_ieee' +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double 
@llvm.maximumnum.f64(double poison, double poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX12-LABEL: 'maximumnum_f64_no_ieee' +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) +; 
GFX12-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'maximumnum_f64_no_ieee' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'maximumnum_f64_no_ieee' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: 
%v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'maximumnum_f64_no_ieee' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX12-SIZE-LABEL: 'maximumnum_f64_no_ieee' +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> 
poison, <3 x double> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.maximumnum.f64(double poison, double poison) + %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison) + %v3f64 = call <3x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison) + %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison) + %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison) + %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison) + ret void +} + +attributes #0 = { "amdgpu-ieee"="false" } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll new file mode 100644 index 0000000000000..b027ccc61266f --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll @@ -0,0 +1,506 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=ALL,GFX7 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=ALL,GFX8 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck 
-check-prefixes=ALL,GFX9 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=ALL,GFX12 %s + +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=SIZE,GFX7-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=SIZE,GFX8-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=SIZE,GFX12-SIZE %s + +define void @minimumnum_f16() { +; GFX7-LABEL: 'minimumnum_f16' +; GFX7-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'minimumnum_f16' +; GFX8-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'minimumnum_f16' +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX9-NEXT: Cost Model: Found 
an estimated cost of 10 for instruction: ret void +; +; GFX12-LABEL: 'minimumnum_f16' +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'minimumnum_f16' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost 
of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'minimumnum_f16' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'minimumnum_f16' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found 
an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX12-SIZE-LABEL: 'minimumnum_f16' +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.minimumnum.f16(half poison, half poison) + %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) + %v3f16 = call <3x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) + %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) + %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) + %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) + ret 
void +} + +define void @minimumnum_bf16() { +; ALL-LABEL: 'minimumnum_bf16' +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'minimumnum_bf16' +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; SIZE-NEXT: Cost 
Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) + %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) + %v3bf16 = call <3x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) + %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) + %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) + %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) + ret void +} + +define void @minimumnum_f32() { +; ALL-LABEL: 'minimumnum_f32' +; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'minimumnum_f32' +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.minimumnum.f32(float poison, float poison) + %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) + %v3f32 = call <3x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) + %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) + %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) + %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) + ret void +} + +define void @minimumnum_f64() { +; ALL-LABEL: 'minimumnum_f64' +; ALL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> 
@llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'minimumnum_f64' +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.minimumnum.f64(double poison, double poison) + %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) + %v3f64 = call <3x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) + 
%v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) + %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) + %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) + ret void +} + +define void @minimumnum_f16_no_ieee() #0 { +; GFX7-LABEL: 'minimumnum_f16_no_ieee' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'minimumnum_f16_no_ieee' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> 
@llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'minimumnum_f16_no_ieee' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX12-LABEL: 'minimumnum_f16_no_ieee' +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> 
poison, <3 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'minimumnum_f16_no_ieee' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'minimumnum_f16_no_ieee' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> 
@llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'minimumnum_f16_no_ieee' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX12-SIZE-LABEL: 'minimumnum_f16_no_ieee' +; GFX12-SIZE-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f16 = call half @llvm.minimumnum.f16(half poison, half poison) + %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison) + %v3f16 = call <3x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison) + %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison) + %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison) + %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison) + ret void +} + +define void @minimumnum_bf16_no_ieee() #0 { +; GFX7-LABEL: 'minimumnum_bf16_no_ieee' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'minimumnum_bf16_no_ieee' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'minimumnum_bf16_no_ieee' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) +; 
GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX12-LABEL: 'minimumnum_bf16_no_ieee' +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost 
of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'minimumnum_bf16_no_ieee' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'minimumnum_bf16_no_ieee' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> 
@llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'minimumnum_bf16_no_ieee' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX12-SIZE-LABEL: 'minimumnum_bf16_no_ieee' +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, 
<3 x bfloat> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison) + %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison) + %v3bf16 = call <3x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison) + %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison) + %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison) + %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison) + ret void +} + +define void @minimumnum_f32_no_ieee() #0 { +; GFX7-LABEL: 'minimumnum_f32_no_ieee' +; GFX7-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = 
call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'minimumnum_f32_no_ieee' +; GFX8-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'minimumnum_f32_no_ieee' +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x 
float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX12-LABEL: 'minimumnum_f32_no_ieee' +; GFX12-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'minimumnum_f32_no_ieee' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'minimumnum_f32_no_ieee' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'minimumnum_f32_no_ieee' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison) 
+; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX12-SIZE-LABEL: 'minimumnum_f32_no_ieee' +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) +; GFX12-SIZE-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: ret void +; + %f32 = call float @llvm.minimumnum.f32(float poison, float poison) + %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison) + %v3f32 = call <3x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison) + %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison) + %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison) + %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison) + ret void +} + +define void @minimumnum_f64_no_ieee() #0 { +; GFX7-LABEL: 'minimumnum_f64_no_ieee' +; GFX7-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX8-LABEL: 'minimumnum_f64_no_ieee' +; GFX8-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = 
call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX8-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX9-LABEL: 'minimumnum_f64_no_ieee' +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX12-LABEL: 'minimumnum_f64_no_ieee' +; GFX12-NEXT: Cost Model: 
Found an estimated cost of 12 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX12-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX7-SIZE-LABEL: 'minimumnum_f64_no_ieee' +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = 
call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX7-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX8-SIZE-LABEL: 'minimumnum_f64_no_ieee' +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX8-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX9-SIZE-LABEL: 'minimumnum_f64_no_ieee' +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x 
double> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX12-SIZE-LABEL: 'minimumnum_f64_no_ieee' +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) +; GFX12-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %f64 = call double @llvm.minimumnum.f64(double poison, double poison) + %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison) + %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison) + %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison) + %v8f64 = call <8 x double> 
@llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison) + %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison) + ret void +} + +attributes #0 = { "amdgpu-ieee"="false" } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll b/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll new file mode 100644 index 0000000000000..00dbcff0a021f --- /dev/null +++ b/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll @@ -0,0 +1,202 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL,UNPACKEDID %s +; RUN: opt -passes='print' 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=ALL,PACKEDID %s + +; RUN: opt -passes='print' -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZE,SIZE-UNPACKEDID %s +; RUN: opt -passes='print' -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=SIZE,SIZE-PACKEDID %s + +define i32 @workitem_id_x() { +; ALL-LABEL: 'workitem_id_x' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result +; +; SIZE-LABEL: 'workitem_id_x' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result +; + %result = call i32 @llvm.amdgcn.workitem.id.x() + ret i32 %result +} + +define amdgpu_kernel void @kernel_workitem_id_x(ptr addrspace(1) %ptr) { +; ALL-LABEL: 'kernel_workitem_id_x' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %result = call i32 @llvm.amdgcn.workitem.id.x() +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4 +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'kernel_workitem_id_x' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4 +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %result = call i32 @llvm.amdgcn.workitem.id.x() + store i32 %result, ptr addrspace(1) %ptr + ret void +} + +define i32 @workitem_id_y() { +; ALL-LABEL: 'workitem_id_y' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result +; +; SIZE-LABEL: 'workitem_id_y' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result +; + %result = call i32 @llvm.amdgcn.workitem.id.y() + ret i32 %result +} + +define amdgpu_kernel void @kernel_workitem_id_y(ptr addrspace(1) %ptr) { +; ALL-LABEL: 'kernel_workitem_id_y' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y() +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4 +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'kernel_workitem_id_y' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 
%result, ptr addrspace(1) %ptr, align 4 +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %result = call i32 @llvm.amdgcn.workitem.id.y() + store i32 %result, ptr addrspace(1) %ptr + ret void +} + +define i32 @workitem_id_z() { +; ALL-LABEL: 'workitem_id_z' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result +; +; SIZE-LABEL: 'workitem_id_z' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result +; + %result = call i32 @llvm.amdgcn.workitem.id.z() + ret i32 %result +} + +define amdgpu_kernel void @kernel_workitem_id_z(ptr addrspace(1) %ptr) { +; ALL-LABEL: 'kernel_workitem_id_z' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z() +; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4 +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; SIZE-LABEL: 'kernel_workitem_id_z' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4 +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %result = call i32 @llvm.amdgcn.workitem.id.z() + store i32 %result, ptr addrspace(1) %ptr + ret void +} + +define i32 @workgroup_id_x() { +; ALL-LABEL: 'workgroup_id_x' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result 
+; +; SIZE-LABEL: 'workgroup_id_x' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result +; + %result = call i32 @llvm.amdgcn.workgroup.id.x() + ret i32 %result +} + +define i32 @workgroup_id_y() { +; ALL-LABEL: 'workgroup_id_y' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result +; +; SIZE-LABEL: 'workgroup_id_y' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result +; + %result = call i32 @llvm.amdgcn.workgroup.id.y() + ret i32 %result +} + +define i32 @workgroup_id_z() { +; ALL-LABEL: 'workgroup_id_z' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.z() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result +; +; SIZE-LABEL: 'workgroup_id_z' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.z() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result +; + %result = call i32 @llvm.amdgcn.workgroup.id.z() + ret i32 %result +} + +define i32 @lds_kernel_id() { +; ALL-LABEL: 'lds_kernel_id' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result +; +; SIZE-LABEL: 'lds_kernel_id' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
ret i32 %result +; + %result = call i32 @llvm.amdgcn.lds.kernel.id() + ret i32 %result +} + +define ptr addrspace(4) @dispatch_ptr() { +; ALL-LABEL: 'dispatch_ptr' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result +; +; SIZE-LABEL: 'dispatch_ptr' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result +; + %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() + ret ptr addrspace(4) %result +} + +define i64 @dispatch_id_() { +; ALL-LABEL: 'dispatch_id_' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i64 %result +; +; SIZE-LABEL: 'dispatch_id_' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %result +; + %result = call i64 @llvm.amdgcn.dispatch.id() + ret i64 %result +} + +define ptr addrspace(4) @implicitarg_ptr() { +; ALL-LABEL: 'implicitarg_ptr' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result +; +; SIZE-LABEL: 'implicitarg_ptr' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result +; + %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + ret 
ptr addrspace(4) %result +} + +define ptr addrspace(4) @queue_ptr() { +; ALL-LABEL: 'queue_ptr' +; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() +; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result +; +; SIZE-LABEL: 'queue_ptr' +; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result +; + %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() + ret ptr addrspace(4) %result +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; PACKEDID: {{.*}} +; SIZE-PACKEDID: {{.*}} +; SIZE-UNPACKEDID: {{.*}} +; UNPACKEDID: {{.*}} diff --git a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll index 244c42cc94ba0..971b14467c0f8 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll @@ -74,7 +74,7 @@ define void @fptosi() { ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f16_nxv64i8 = fptosi undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64f16_nxv64i32 = fptosi undef to -; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptosi undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptosi undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64f16_nxv64i1 = fptosi undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -147,7 +147,7 @@ define void @fptosi() { ; 
RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %nxv64f16_nxv64i8 = fptosi undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv64f16_nxv64i32 = fptosi undef to -; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptosi undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptosi undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %nxv64f16_nxv64i1 = fptosi undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -440,7 +440,7 @@ define void @fptoui() { ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f16_nxv64i8 = fptoui undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64f16_nxv64i32 = fptoui undef to -; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptoui undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptoui undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64f16_nxv64i1 = fptoui undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -513,7 +513,7 @@ define void @fptoui() { ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %nxv64f16_nxv64i8 = fptoui undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %nxv64f16_nxv64i32 = fptoui undef to -; 
RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptoui undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptoui undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %nxv64f16_nxv64i1 = fptoui undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -806,7 +806,7 @@ define void @sitofp() { ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64f16 = sitofp undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64f16 = sitofp undef to -; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = sitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = sitofp undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64f16 = sitofp undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -879,7 +879,7 @@ define void @sitofp() { ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %nxv64i8_nxv64f16 = sitofp undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %nxv64i32_nxv64f16 = sitofp undef to -; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = sitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = sitofp undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %nxv64i1_nxv64f16 = sitofp undef to ; 
RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -1172,7 +1172,7 @@ define void @uitofp() { ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64f16 = uitofp undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64f16 = uitofp undef to -; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = uitofp undef to +; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = uitofp undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64f16 = uitofp undef to ; RV32ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -1245,7 +1245,7 @@ define void @uitofp() { ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %nxv64i8_nxv64f16 = uitofp undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %nxv64i32_nxv64f16 = uitofp undef to -; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = uitofp undef to +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = uitofp undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %nxv64i1_nxv64f16 = uitofp undef to ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index e498ccc733040..bdd8540a2c475 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -3,651 +3,328 @@ ; RUN: opt < 
%s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64 define void @sext() { -; RV32-LABEL: 'sext' -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = sext <2 x i8> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = sext <2 x i16> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = sext <2 x i16> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = sext <2 x i32> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = sext <2 x i1> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8> -; 
RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = 
call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> -; 
RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 
x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) 
-; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated 
cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost 
Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 136 
for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = 
call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.sext.nxv1i8.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext undef to -; 
RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.sext.nxv2i8.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 
2 for instruction: %nxv4i16_nxv4i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.sext.nxv4i8.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = 
call @llvm.vp.sext.nxv4i32.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i16( 
undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.sext.nxv8i8.nxv8i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i8( undef, 
undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i8.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext undef to -; RV32-NEXT: Cost 
Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i8.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for 
instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = sext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: 
%vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i8.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = sext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call 
@llvm.vp.sext.nxv128i32.nxv128i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; RV64-LABEL: 'sext' -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = sext <2 x i8> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = sext <2 x i16> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = sext <2 x i16> undef to <2 x 
i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = sext <2 x i32> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = sext <2 x i1> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 
x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64> -; RV64-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 
undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 
x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 
x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost 
of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = 
call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 70 
for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %nxv1i16_nxv1i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.sext.nxv1i8.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 
= call @llvm.vp.sext.nxv1i32.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call 
@llvm.vp.sext.nxv2i64.nxv2i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.sext.nxv2i8.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i8( undef, 
undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.sext.nxv4i8.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%nxv8i32_nxv8i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.sext.nxv8i8.nxv8i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i1( undef, undef, i32 undef) -; RV64-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i32( undef, undef, i32 
undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i8.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i8( 
undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i8.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %nxv64i1_nxv64i8 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i8.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated 
cost of 34 for instruction: %nxv128i8_nxv128i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext undef to -; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext undef to -; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext undef to -; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = sext undef to -; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: 
%vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'sext' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = sext <2 x i8> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = sext <2 x i16> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = sext <2 x i16> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = sext <2 x i32> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = sext <2 x i1> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 
x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, 
<4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32> +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, 
i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64> +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to 
<256 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for 
instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nxv1i1_nxv1i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.sext.nxv1i8.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.sext.nxv1i16.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.sext.nxv1i32.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.sext.nxv1i64.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext undef to +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.sext.nxv2i8.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.sext.nxv2i16.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.sext.nxv2i32.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.sext.nxv2i64.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.sext.nxv4i8.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.sext.nxv4i16.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.sext.nxv4i32.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.sext.nxv4i64.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.sext.nxv8i8.nxv8i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.sext.nxv8i16.nxv8i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.sext.nxv8i32.nxv8i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.sext.nxv8i64.nxv8i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: 
%nxv16i16_nxv16i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.sext.nxv16i8.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.sext.nxv16i16.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.sext.nxv16i32.nxv16i1( undef, undef, i32 
undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.sext.nxv16i64.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call 
@llvm.vp.sext.nxv32i64.nxv32i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.sext.nxv32i8.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.sext.nxv32i16.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.sext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.sext.nxv32i64.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.sext.nxv64i8.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.sext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.sext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.sext.nxv64i64.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext undef to +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext undef to +; CHECK-NEXT: Cost Model: Invalid cost 
for instruction: %nxv128i16_nxv128i128 = sext undef to +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = sext undef to +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.sext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.sext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.sext.nxv128i32.nxv128i1( undef, undef, i32 undef) 
+; CHECK-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.sext.nxv128i128.nxv128i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16> %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32> @@ -1005,651 +682,328 @@ define void @sext() { } define void @zext() { -; RV32-LABEL: 'zext' -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = zext <2 x i8> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = zext <2 x i16> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = zext <2 x i16> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = zext <2 x i32> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = zext <2 x i1> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 
2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> 
@llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> 
undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: 
%v64i32_v64i64 = zext <64 x i32> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; 
RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> 
undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = 
zext <256 x i16> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 
25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call 
@llvm.vp.zext.nxv1i32.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.zext.nxv1i8.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = 
zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.zext.nxv2i8.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext undef to -; RV32-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.zext.nxv4i8.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.zext.nxv8i8.nxv8i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext undef to -; RV32-NEXT: Cost 
Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i8.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext undef to -; 
RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i8.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i1( undef, 
undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i16( undef, undef, i32 undef) -; 
RV32-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i8.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext undef to -; RV32-NEXT: Cost Model: Found an 
estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; RV64-LABEL: 'zext' -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = zext <2 x i8> undef to <2 x i64> -; 
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = zext <2 x i16> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = zext <2 x i16> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = zext <2 x i32> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = zext <2 x i1> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> 
@llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32> -; RV64-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 
= call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x 
i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an 
estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost 
of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef 
to <256 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext undef to 
-; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.zext.nxv1i8.nxv1i1( undef, 
undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.zext.nxv2i8.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext undef to -; 
RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.zext.nxv4i8.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 
4 for instruction: %nxv8i16_nxv8i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.zext.nxv8i8.nxv8i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = 
call @llvm.vp.zext.nxv8i32.nxv8i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: 
%vp_nxv16i16_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i8.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i8.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext undef to -; RV64-NEXT: Cost Model: Found an 
estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i8.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call 
@llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext undef to -; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext undef to -; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext undef to -; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext undef to -; RV64-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call 
@llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'zext' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = zext <2 x i8> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = zext <2 x i16> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = zext <2 x i16> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = zext <2 x i32> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = zext <2 x i1> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = 
zext <2 x i1> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to 
<4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%v8i1_v8i64 = zext <8 x i1> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = 
zext <16 x i8> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32> +; CHECK-NEXT: Cost Model: Found 
an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext 
<64 x i8> undef to <64 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> 
@llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef 
to <128 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 
x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = zext undef to +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call @llvm.vp.zext.nxv1i8.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call @llvm.vp.zext.nxv1i16.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call @llvm.vp.zext.nxv1i32.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call @llvm.vp.zext.nxv1i64.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %nxv2i8_nxv2i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vp_nxv2i1_nxv2i8 = call @llvm.vp.zext.nxv2i8.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call @llvm.vp.zext.nxv2i16.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call @llvm.vp.zext.nxv2i32.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call @llvm.vp.zext.nxv2i64.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%vp_nxv4i8_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call @llvm.vp.zext.nxv4i8.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call @llvm.vp.zext.nxv4i16.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call @llvm.vp.zext.nxv4i32.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call @llvm.vp.zext.nxv4i64.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext 
undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call @llvm.vp.zext.nxv8i8.nxv8i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call @llvm.vp.zext.nxv8i16.nxv8i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call @llvm.vp.zext.nxv8i32.nxv8i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call @llvm.vp.zext.nxv8i64.nxv8i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext undef to +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call @llvm.vp.zext.nxv16i8.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call @llvm.vp.zext.nxv16i16.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call @llvm.vp.zext.nxv16i32.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call @llvm.vp.zext.nxv16i64.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i8( undef, undef, i32 undef) +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call @llvm.vp.zext.nxv32i8.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call @llvm.vp.zext.nxv32i16.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call @llvm.vp.zext.nxv32i32.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call @llvm.vp.zext.nxv32i64.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext undef to +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call @llvm.vp.zext.nxv64i8.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call @llvm.vp.zext.nxv64i16.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call @llvm.vp.zext.nxv64i32.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call @llvm.vp.zext.nxv64i64.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = 
zext undef to +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext undef to +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext undef to +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext undef to +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for 
instruction: %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16> %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32> @@ -2007,631 +1361,318 @@ define void @zext() { } define void @trunc() { -; RV32-LABEL: 'trunc' -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6> -; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> -; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i16 = trunc <2 x i32> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i16 = trunc <2 x i64> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2i32 = trunc <2 x i64> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2i1 = trunc <2 x i8> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> 
@llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i16 = trunc <4 x i32> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4i16 = trunc <4 x i64> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4i32 = trunc <4 x i64> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i1 = trunc <4 x i8> undef to <4 x i1> -; RV32-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> 
@llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i16 = trunc <8 x i32> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_v8i16 = trunc <8 x i64> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8i32 = trunc <8 x i64> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i1 = trunc <8 x i8> undef to <8 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i16 = trunc <2 x i32> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i16 = trunc <2 x i64> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16i32 = trunc <2 x i64> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i1 = trunc <2 x i8> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1> -; 
RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> 
@llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_v32i16 = trunc <16 x i32> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i64_v32i16 = trunc <16 x i64> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i64_v32i32 = trunc <16 x i64> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32i1 = trunc <16 x i8> undef to <16 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> 
@llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64i32_v64i16 = trunc <64 x i32> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64i16 = trunc <64 x i64> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i1 
= trunc <64 x i8> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> 
@llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128i32_v128i16 = trunc <128 x i32> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128i16 = trunc <128 x i64> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: 
%vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v256i32_v256i16 = trunc <256 x i32> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an 
estimated cost of 108 for instruction: %v256i64_v256i16 = trunc <256 x i64> undef to <256 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i8_v256i1 = call <256 x i1> 
@llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1i32 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i32( undef, undef, i32 
undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call @llvm.vp.trunc.nxv1i32.nxv1i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2i32 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i1 = trunc undef to -; RV32-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call @llvm.vp.trunc.nxv2i32.nxv2i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %nxv4i32_nxv4i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_nxv4i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4i32 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i32.nxv4i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_nxv8i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i64_nxv8i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8i32 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i32.nxv8i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i32_nxv16i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16i64_nxv16i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16i32 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc 
undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i32.nxv16i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: 
%nxv32i64_nxv32i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32i32_nxv32i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32i64_nxv32i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32i32 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i32.nxv32i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 
= call @llvm.vp.trunc.nxv32i1.nxv32i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i64_nxv64i8 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64i16 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64i32 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i8_nxv64i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc undef to -; RV32-NEXT: Cost Model: Invalid cost for instruction: %nxv64i64_nxv64i1 = trunc undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call 
@llvm.vp.trunc.nxv64i16.nxv64i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i32.nxv64i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; RV64-LABEL: 'trunc' -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6> -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> 
@llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i16 = trunc <2 x i32> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i16 = trunc <2 x i64> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2i32 = trunc <2 x i64> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2i1 = trunc <2 x i8> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i16 = trunc <4 x i32> undef to 
<4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4i16 = trunc <4 x i64> undef to <4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4i32 = trunc <4 x i64> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i1 = trunc <4 x i8> undef to <4 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i16 = trunc <8 x i32> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_v8i16 = trunc <8 x i64> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8i32 = trunc <8 x i64> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i1 = trunc <8 x i8> undef to <8 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x 
i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i16 = trunc <2 x i32> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i16 = trunc <2 
x i64> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16i32 = trunc <2 x i64> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i1 = trunc <2 x i8> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call 
<16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_v32i16 = trunc <16 x i32> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i64_v32i16 = trunc <16 x i64> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i64_v32i32 = trunc <16 x i64> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32i1 = trunc <16 x i8> undef to <16 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> 
@llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64i32_v64i16 = trunc <64 x 
i32> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64i16 = trunc <64 x i64> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> 
@llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128i32_v128i16 = trunc <128 x i32> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128i16 = trunc <128 x i64> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 
= call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: 
%v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v256i32_v256i16 = trunc <256 x i32> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v256i64_v256i16 = trunc <256 x i64> undef to <256 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> 
@llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1i32 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc undef to -; RV64-NEXT: Cost Model: Found 
an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call @llvm.vp.trunc.nxv1i32.nxv1i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %nxv2i32_nxv2i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2i32 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call @llvm.vp.trunc.nxv2i32.nxv2i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_nxv2i32_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_nxv4i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4i32 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = 
call @llvm.vp.trunc.nxv4i16.nxv4i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i32.nxv4i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_nxv8i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i64_nxv8i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8i32 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call 
@llvm.vp.trunc.nxv8i8.nxv8i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i32.nxv8i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i32_nxv16i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16i64_nxv16i16 = trunc undef to -; RV64-NEXT: 
Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16i32 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i32.nxv16i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated 
cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32i32_nxv32i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32i64_nxv32i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32i32 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i64( undef, undef, i32 undef) -; 
RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i32.nxv32i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64i64_nxv64i8 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64i16 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64i32 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i8_nxv64i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i64_nxv64i1 = trunc undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call 
@llvm.vp.trunc.nxv64i8.nxv64i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i32.nxv64i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'trunc' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6> +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i16 = trunc <2 x i32> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i16 = trunc <2 x i64> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2i32 = trunc <2 x i64> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2i1 = trunc <2 x i8> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x 
i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i16 = trunc <4 x i32> undef to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4i16 = trunc <4 x i64> undef to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4i32 = trunc <4 x i64> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i1 = trunc <4 x i8> undef to <4 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i16 = trunc <8 x i32> undef to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64_v8i16 = trunc <8 x i64> undef to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8i32 = trunc <8 x i64> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i1 = trunc <8 x i8> undef to <8 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1> +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i16 = trunc <2 x i32> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i16 = trunc <2 x i64> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16i32 = trunc <2 x i64> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i1 = trunc <2 x i8> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i32_v32i16 = trunc <16 x i32> undef to <16 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32i64_v32i16 = trunc <16 x i64> undef to <16 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i64_v32i32 = trunc <16 x i64> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32i1 = trunc <16 x i8> undef to <16 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x 
i16> undef to <16 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: 
%vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64i32_v64i16 = trunc <64 x i32> undef to <64 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64i16 = trunc <64 x i64> undef to <64 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: 
%vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128i32_v128i16 = trunc <128 x i32> undef to <128 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128i16 = trunc <128 x i64> undef to <128 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32> +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> 
undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v256i32_v256i16 = trunc <256 x i32> undef to <256 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v256i64_v256i16 = trunc <256 x i64> undef to <256 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: 
%vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %nxv1i32_nxv1i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call @llvm.vp.trunc.nxv1i8.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call @llvm.vp.trunc.nxv1i16.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call @llvm.vp.trunc.nxv1i32.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call @llvm.vp.trunc.nxv1i1.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call @llvm.vp.trunc.nxv2i8.nxv2i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %vp_nxv2i64_nxv2i16 = call @llvm.vp.trunc.nxv2i16.nxv2i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call @llvm.vp.trunc.nxv2i32.nxv2i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call @llvm.vp.trunc.nxv2i1.nxv2i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_nxv4i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %vp_nxv4i16_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call @llvm.vp.trunc.nxv4i8.nxv4i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call @llvm.vp.trunc.nxv4i16.nxv4i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call @llvm.vp.trunc.nxv4i32.nxv4i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call @llvm.vp.trunc.nxv4i1.nxv4i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_nxv8i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%nxv8i64_nxv8i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call @llvm.vp.trunc.nxv8i8.nxv8i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call @llvm.vp.trunc.nxv8i16.nxv8i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call @llvm.vp.trunc.nxv8i32.nxv8i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call @llvm.vp.trunc.nxv8i1.nxv8i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i32_nxv16i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16i64_nxv16i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call @llvm.vp.trunc.nxv16i8.nxv16i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call @llvm.vp.trunc.nxv16i16.nxv16i64( undef, undef, i32 
undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call @llvm.vp.trunc.nxv16i32.nxv16i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call @llvm.vp.trunc.nxv16i1.nxv16i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32i32_nxv32i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32i64_nxv32i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call 
@llvm.vp.trunc.nxv32i8.nxv32i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call @llvm.vp.trunc.nxv32i8.nxv32i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call @llvm.vp.trunc.nxv32i16.nxv32i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call @llvm.vp.trunc.nxv32i32.nxv32i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call @llvm.vp.trunc.nxv32i1.nxv32i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64i64_nxv64i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for 
instruction: %nxv64i64_nxv64i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv64i8_nxv64i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i64_nxv64i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64i64_nxv64i8 = call @llvm.vp.trunc.nxv64i8.nxv64i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64i64_nxv64i16 = call @llvm.vp.trunc.nxv64i16.nxv64i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64i64_nxv64i32 = call @llvm.vp.trunc.nxv64i32.nxv64i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call 
@llvm.vp.trunc.nxv64i1.nxv64i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i64_nxv64i1 = call @llvm.vp.trunc.nxv64i1.nxv64i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2> @@ -3386,571 +2427,288 @@ define void @fptrunc() { } define void @fptosi() { -; RV32-LABEL: 'fptosi' -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> 
@llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x 
i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 
undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: 
Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> 
undef to <32 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 
x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64> -; RV32-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for 
instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f64( undef, 
undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%nxv2f64_nxv2i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%nxv4f64_nxv4i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%vp_nxv4f32_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi undef to -; RV32-NEXT: Cost Model: 
Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 
= fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an 
estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %nxv64f32_nxv64i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f64( 
undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; RV64-LABEL: 'fptosi' -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64> -; RV64-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 
= call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = 
fptosi <8 x float> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> 
undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x 
float> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32> -; 
RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 
2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost 
Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: 
%v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an 
estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi undef to 
-; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi undef to -; 
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call 
@llvm.vp.fptosi.nxv2i1.nxv2f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = 
call @llvm.vp.fptosi.nxv4i32.nxv4f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%vp_nxv8f32_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%nxv16f32_nxv16i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call 
@llvm.vp.fptosi.nxv16i1.nxv16f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = 
fptosi undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'fptosi' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8> +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> 
@llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 
undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 
for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> 
@llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for 
instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi 
<64 x double> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> 
@llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptosi.nxv1i8.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptosi.nxv1i16.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptosi.nxv1i32.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptosi.nxv1i64.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptosi.nxv1i1.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptosi.nxv2i8.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptosi.nxv2i16.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptosi.nxv2i32.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptosi.nxv2i64.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptosi.nxv2i1.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %nxv4f32_nxv4i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptosi.nxv4i8.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptosi.nxv4i16.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptosi.nxv4i32.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptosi.nxv4i64.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptosi.nxv4i1.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %nxv8f32_nxv8i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptosi.nxv8i8.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptosi.nxv8i16.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptosi.nxv8i32.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptosi.nxv8i64.nxv8f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = 
call @llvm.vp.fptosi.nxv8i64.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptosi.nxv8i1.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptosi.nxv16i8.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptosi.nxv16i16.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptosi.nxv16i32.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptosi.nxv16i64.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptosi.nxv16i1.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi undef to +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptosi.nxv32i8.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptosi.nxv32i16.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptosi.nxv32i32.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptosi.nxv32i64.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptosi.nxv32i1.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for 
instruction: %nxv64f64_nxv64i8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptosi.nxv64i8.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptosi.nxv64i16.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptosi.nxv64i32.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: 
%vp_nxv64f32_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptosi.nxv64i64.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptosi.nxv64i1.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8> %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8> @@ -4264,571 +3022,288 @@ define void @fptosi() { } define void @fptoui() { -; RV32-LABEL: 'fptoui' -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1> -; RV32-NEXT: 
Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> 
undef to <4 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> 
undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an 
estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; 
RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> 
undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x 
i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for 
instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16> -; RV32-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32> -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for 
instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui undef to -; 
RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f64( undef, undef, i32 undef) -; RV32-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f64( undef, undef, i32 undef) -; 
RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f64( undef, undef, i32 undef) 
-; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 
= fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated 
cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f64( undef, 
undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %nxv64f32_nxv64i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 63 for instruction: 
%vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; RV64-LABEL: 'fptoui' -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16> -; RV64-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 
for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> 
@llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x 
i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to 
<32 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: 
Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 
for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> 
@llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32> -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; 
RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui undef to -; 
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f64( undef, undef, i32 undef) -; RV64-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f64( undef, undef, i32 undef) -; 
RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f64( undef, undef, i32 undef) -; 
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui undef to -; 
RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui undef to -; 
RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 
for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f64( undef, undef, i32 undef) 
-; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 
68 for instruction: %nxv64f32_nxv64i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f64( undef, undef, i32 undef) -; 
RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'fptoui' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x 
i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = 
fptoui <4 x float> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> 
undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 
undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> 
@llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for 
instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef 
to <64 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> 
@llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = 
fptoui <128 x float> undef to <128 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: 
%vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f32( undef, undef, i32 
undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%vp_nxv4f64_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 
4 for instruction: %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f32( undef, undef, i32 undef) +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui 
undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f32( 
undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8> %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8> @@ -5142,571 +3617,288 @@ define void @fptoui() { } define void @sitofp() { -; RV32-LABEL: 'sitofp' -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> 
@llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> 
undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 
= sitofp <8 x i32> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; 
RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 
x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: 
%vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> -; RV32-NEXT: Cost Model: 
Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> -; RV32-NEXT: Cost 
Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found 
an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to -; 
RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp undef to -; 
RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%nxv8i1_nxv8f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %nxv16i16_nxv16f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i16( undef, 
undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: 
%nxv64i64_nxv64f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; RV64-LABEL: 'sitofp' -; 
RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> 
@llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double> 
-; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = 
call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> -; 
RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %nxv1i8_nxv1f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp undef to -; RV64-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to -; RV64-NEXT: Cost Model: 
Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call 
@llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp 
undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an 
estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call 
@llvm.vp.sitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'sitofp' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 
3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> 
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> 
undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> 
@llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double> +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> 
undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> 
@llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float> +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: 
%vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp undef to +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp undef to +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: 
%vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp undef to +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp undef 
to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call 
@llvm.vp.sitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: 
%nxv64i1_nxv64f32 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float> %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> @@ -6020,571 +4212,288 @@ define 
void @sitofp() { } define void @uitofp() { -; RV32-LABEL: 'uitofp' -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> 
undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> -; RV32-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for 
instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> -; RV32-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: 
Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x 
i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = 
uitofp <64 x i16> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 
18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp 
<128 x i1> undef to <128 x double> -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) -; 
RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i8( undef, undef, i32 
undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = 
uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = 
uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: 
%nxv32i64_nxv32f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) -; RV32-NEXT: 
Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to -; RV32-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call 
@llvm.vp.uitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; RV64-LABEL: 'uitofp' -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> -; RV64-NEXT: Cost 
Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 
undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 
x i1> undef to <8 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 
for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x 
float> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x 
float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double> -; RV64-NEXT: Cost 
Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated 
cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = 
uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i1( undef, 
undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call 
@llvm.vp.uitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: 
%vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp undef 
to -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call 
@llvm.vp.uitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an 
estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to -; RV64-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call 
@llvm.vp.uitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'uitofp' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x 
float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> +; CHECK-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x 
double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> 
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 
undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> 
@llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double> +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> 
undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated 
cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %nxv1i16_nxv1f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%nxv4i1_nxv4f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %nxv8i16_nxv8f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%nxv32i64_nxv32f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp undef to +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp undef to +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float> %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> @@ -6985,3 +4894,6 @@ define void @legalization_crash() { fptoui <192 x float> undef to <192 x i1> ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/Analysis/CostModel/RISCV/cmp.ll b/llvm/test/Analysis/CostModel/RISCV/cmp.ll index 69d4f27ac41be..793f0dd2fe049 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cmp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cmp.ll @@ -3,331 +3,168 @@ ; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+f -passes="print" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64 define void @icmp() { -; RV32-LABEL: 'icmp' -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef -; 
RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = icmp slt <8 x i16> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata 
!"slt", <8 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef 
-; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %51 = icmp slt <64 x i1> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x 
i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp slt <128 x i64> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %67 = call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found 
an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %83 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %86 = call @llvm.vp.icmp.nxv1i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %87 = call @llvm.vp.icmp.nxv1i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %88 = call @llvm.vp.icmp.nxv1i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %89 = call @llvm.vp.icmp.nxv1i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %90 = call @llvm.vp.icmp.nxv1i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %92 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %95 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %96 = call @llvm.vp.icmp.nxv2i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %97 = call @llvm.vp.icmp.nxv2i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %98 = call @llvm.vp.icmp.nxv2i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %99 = call @llvm.vp.icmp.nxv2i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %100 = call @llvm.vp.icmp.nxv2i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %105 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %106 = call @llvm.vp.icmp.nxv4i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %107 = call @llvm.vp.icmp.nxv4i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %108 = call @llvm.vp.icmp.nxv4i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %109 = call @llvm.vp.icmp.nxv4i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %110 = call @llvm.vp.icmp.nxv4i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt undef, undef -; RV32-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %115 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %116 = call @llvm.vp.icmp.nxv8i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %117 = call @llvm.vp.icmp.nxv8i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %118 = call @llvm.vp.icmp.nxv8i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %119 = call @llvm.vp.icmp.nxv8i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %120 = call @llvm.vp.icmp.nxv8i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %126 = call @llvm.vp.icmp.nxv16i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %127 = call @llvm.vp.icmp.nxv16i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %128 = call @llvm.vp.icmp.nxv16i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %129 = call @llvm.vp.icmp.nxv16i32( undef, undef, metadata !"slt", undef, i32 undef) -; 
RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %130 = call @llvm.vp.icmp.nxv16i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %136 = call @llvm.vp.icmp.nxv32i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %137 = call @llvm.vp.icmp.nxv32i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %138 = call @llvm.vp.icmp.nxv32i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %139 = call @llvm.vp.icmp.nxv32i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %140 = call @llvm.vp.icmp.nxv32i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %143 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Invalid cost for instruction: %145 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %146 = call @llvm.vp.icmp.nxv64i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %147 = call @llvm.vp.icmp.nxv64i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %148 = call @llvm.vp.icmp.nxv64i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %149 = call @llvm.vp.icmp.nxv64i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %150 = call @llvm.vp.icmp.nxv64i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Invalid cost for instruction: %155 = icmp slt undef, undef -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %156 = call @llvm.vp.icmp.nxv128i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %157 = call @llvm.vp.icmp.nxv128i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %158 = call @llvm.vp.icmp.nxv128i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %159 = call @llvm.vp.icmp.nxv128i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost Model: Invalid cost for instruction: %160 = call @llvm.vp.icmp.nxv128i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV32-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret void -; -; RV64-LABEL: 'icmp' -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> 
undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = icmp slt <8 x i16> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef) -; 
RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> 
@llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %51 = icmp slt <64 x i1> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef -; RV64-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp slt <128 x i64> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %67 = 
call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> 
@llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %83 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %86 = call @llvm.vp.icmp.nxv1i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %87 = call @llvm.vp.icmp.nxv1i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %88 = call @llvm.vp.icmp.nxv1i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %89 = call @llvm.vp.icmp.nxv1i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %90 = call @llvm.vp.icmp.nxv1i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %92 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt undef, undef -; RV64-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %95 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %96 = call @llvm.vp.icmp.nxv2i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %97 = call @llvm.vp.icmp.nxv2i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %98 = call @llvm.vp.icmp.nxv2i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %99 = call @llvm.vp.icmp.nxv2i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %100 = call @llvm.vp.icmp.nxv2i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %105 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %106 = call @llvm.vp.icmp.nxv4i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %107 = call @llvm.vp.icmp.nxv4i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %108 = call @llvm.vp.icmp.nxv4i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %109 = call @llvm.vp.icmp.nxv4i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %110 = call @llvm.vp.icmp.nxv4i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %115 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %116 = call @llvm.vp.icmp.nxv8i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %117 = call @llvm.vp.icmp.nxv8i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %118 = call @llvm.vp.icmp.nxv8i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %119 = call @llvm.vp.icmp.nxv8i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %120 = call @llvm.vp.icmp.nxv8i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %126 = call @llvm.vp.icmp.nxv16i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %127 = call @llvm.vp.icmp.nxv16i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %128 = call @llvm.vp.icmp.nxv16i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %129 = call @llvm.vp.icmp.nxv16i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %130 = call @llvm.vp.icmp.nxv16i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %136 = call @llvm.vp.icmp.nxv32i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %137 = call @llvm.vp.icmp.nxv32i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %138 = call @llvm.vp.icmp.nxv32i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %139 = call @llvm.vp.icmp.nxv32i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %140 = call @llvm.vp.icmp.nxv32i64( undef, undef, metadata !"slt", 
undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %143 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %145 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %146 = call @llvm.vp.icmp.nxv64i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %147 = call @llvm.vp.icmp.nxv64i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %148 = call @llvm.vp.icmp.nxv64i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %149 = call @llvm.vp.icmp.nxv64i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %150 = call @llvm.vp.icmp.nxv64i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %155 = icmp slt undef, undef -; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %156 = call @llvm.vp.icmp.nxv128i1( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %157 = call @llvm.vp.icmp.nxv128i8( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %158 = call @llvm.vp.icmp.nxv128i16( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %159 = call @llvm.vp.icmp.nxv128i32( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %160 = call @llvm.vp.icmp.nxv128i64( undef, undef, metadata !"slt", undef, i32 undef) -; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'icmp' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 
x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %23 = icmp slt <8 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> 
@llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %51 = icmp slt <64 x i1> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp slt <128 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %67 = call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %83 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %86 = call @llvm.vp.icmp.nxv1i1( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %87 = call @llvm.vp.icmp.nxv1i8( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %88 = call @llvm.vp.icmp.nxv1i16( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %89 = call @llvm.vp.icmp.nxv1i32( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %90 = call @llvm.vp.icmp.nxv1i64( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %92 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %95 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %96 = call @llvm.vp.icmp.nxv2i1( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %97 = call @llvm.vp.icmp.nxv2i8( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %98 = call @llvm.vp.icmp.nxv2i16( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %99 = call @llvm.vp.icmp.nxv2i32( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %100 = call @llvm.vp.icmp.nxv2i64( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated 
cost of 4 for instruction: %105 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %106 = call @llvm.vp.icmp.nxv4i1( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %107 = call @llvm.vp.icmp.nxv4i8( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %108 = call @llvm.vp.icmp.nxv4i16( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %109 = call @llvm.vp.icmp.nxv4i32( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %110 = call @llvm.vp.icmp.nxv4i64( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %115 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %116 = call @llvm.vp.icmp.nxv8i1( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %117 = call @llvm.vp.icmp.nxv8i8( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %118 = call @llvm.vp.icmp.nxv8i16( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %119 = call @llvm.vp.icmp.nxv8i32( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %120 = call @llvm.vp.icmp.nxv8i64( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %126 = call @llvm.vp.icmp.nxv16i1( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %127 = call @llvm.vp.icmp.nxv16i8( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %128 = call @llvm.vp.icmp.nxv16i16( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %129 = call @llvm.vp.icmp.nxv16i32( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %130 = call @llvm.vp.icmp.nxv16i64( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 4 for instruction: %136 = call @llvm.vp.icmp.nxv32i1( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %137 = call @llvm.vp.icmp.nxv32i8( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %138 = call @llvm.vp.icmp.nxv32i16( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %139 = call @llvm.vp.icmp.nxv32i32( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %140 = call @llvm.vp.icmp.nxv32i64( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %143 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %145 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %146 = call @llvm.vp.icmp.nxv64i1( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %147 = call @llvm.vp.icmp.nxv64i8( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %148 = call @llvm.vp.icmp.nxv64i16( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %149 = call @llvm.vp.icmp.nxv64i32( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %150 = call @llvm.vp.icmp.nxv64i64( 
undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %155 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %156 = call @llvm.vp.icmp.nxv128i1( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %157 = call @llvm.vp.icmp.nxv128i8( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %158 = call @llvm.vp.icmp.nxv128i16( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %159 = call @llvm.vp.icmp.nxv128i32( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %160 = call @llvm.vp.icmp.nxv128i64( undef, undef, metadata !"slt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; icmp slt <2 x i1> undef, undef icmp slt <2 x i8> undef, undef @@ -658,3 +495,6 @@ define void @fcmp() { ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll index e1bca71614125..437a9af8fcc83 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll @@ -78,47 +78,47 @@ declare @llvm.vector.insert.nxv16i32.nxv4i32( @llvm.vector.reverse.nxv16i8( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv32i8 = call @llvm.vector.reverse.nxv32i8( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv16i8 = call @llvm.vector.reverse.nxv16i8( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv32i8 = call @llvm.vector.reverse.nxv32i8( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call @llvm.vector.reverse.nxv2i16( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call @llvm.vector.reverse.nxv4i16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv8i16 = call @llvm.vector.reverse.nxv8i16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call @llvm.vector.reverse.nxv16i16( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call @llvm.vector.reverse.nxv4i32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call @llvm.vector.reverse.nxv8i32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv16i32 = call @llvm.vector.reverse.nxv16i32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv32i32 = call @llvm.vector.reverse.nxv32i32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: 
%reverse_nxv2i64 = call @llvm.vector.reverse.nxv2i64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call @llvm.vector.reverse.nxv4i64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call @llvm.vector.reverse.nxv8i64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv16i64 = call @llvm.vector.reverse.nxv16i64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 332 for instruction: %reverse_nxv32i64 = call @llvm.vector.reverse.nxv32i64( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %reverse_nxv16i1 = call @llvm.vector.reverse.nxv16i1( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call @llvm.vector.reverse.nxv8i1( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call @llvm.vector.reverse.nxv4i1( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call @llvm.vector.reverse.nxv2i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv8i16 = call @llvm.vector.reverse.nxv8i16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i16 = call @llvm.vector.reverse.nxv16i16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv4i32 = call @llvm.vector.reverse.nxv4i32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i32 = call @llvm.vector.reverse.nxv8i32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv16i32 = call @llvm.vector.reverse.nxv16i32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv32i32 = call @llvm.vector.reverse.nxv32i32( undef) +; CHECK-NEXT: Cost Model: Found an estimated 
cost of 7 for instruction: %reverse_nxv2i64 = call @llvm.vector.reverse.nxv2i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i64 = call @llvm.vector.reverse.nxv4i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv8i64 = call @llvm.vector.reverse.nxv8i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv16i64 = call @llvm.vector.reverse.nxv16i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %reverse_nxv32i64 = call @llvm.vector.reverse.nxv32i64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %reverse_nxv16i1 = call @llvm.vector.reverse.nxv16i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv8i1 = call @llvm.vector.reverse.nxv8i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv4i1 = call @llvm.vector.reverse.nxv4i1( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv2i1 = call @llvm.vector.reverse.nxv2i1( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SIZE-LABEL: 'vector_reverse' -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call @llvm.vector.reverse.nxv16i8( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call @llvm.vector.reverse.nxv32i8( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv16i8 = call @llvm.vector.reverse.nxv16i8( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv32i8 = call @llvm.vector.reverse.nxv32i8( undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call @llvm.vector.reverse.nxv2i16( undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 
for instruction: %reverse_nxv4i16 = call @llvm.vector.reverse.nxv4i16( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call @llvm.vector.reverse.nxv8i16( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call @llvm.vector.reverse.nxv16i16( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call @llvm.vector.reverse.nxv4i32( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call @llvm.vector.reverse.nxv8i32( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i32 = call @llvm.vector.reverse.nxv16i32( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv32i32 = call @llvm.vector.reverse.nxv32i32( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call @llvm.vector.reverse.nxv2i64( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call @llvm.vector.reverse.nxv4i64( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call @llvm.vector.reverse.nxv8i64( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i64 = call @llvm.vector.reverse.nxv16i64( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %reverse_nxv32i64 = call @llvm.vector.reverse.nxv32i64( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call @llvm.vector.reverse.nxv16i1( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call @llvm.vector.reverse.nxv8i1( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call @llvm.vector.reverse.nxv4i1( undef) -; SIZE-NEXT: Cost Model: Found an estimated cost 
of 9 for instruction: %reverse_nxv2i1 = call @llvm.vector.reverse.nxv2i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv8i16 = call @llvm.vector.reverse.nxv8i16( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i16 = call @llvm.vector.reverse.nxv16i16( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv4i32 = call @llvm.vector.reverse.nxv4i32( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i32 = call @llvm.vector.reverse.nxv8i32( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv16i32 = call @llvm.vector.reverse.nxv16i32( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv32i32 = call @llvm.vector.reverse.nxv32i32( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv2i64 = call @llvm.vector.reverse.nxv2i64( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i64 = call @llvm.vector.reverse.nxv4i64( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv8i64 = call @llvm.vector.reverse.nxv8i64( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv16i64 = call @llvm.vector.reverse.nxv16i64( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %reverse_nxv32i64 = call @llvm.vector.reverse.nxv32i64( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i1 = call @llvm.vector.reverse.nxv16i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv8i1 = call @llvm.vector.reverse.nxv8i1( undef) +; SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv4i1 = call @llvm.vector.reverse.nxv4i1( undef) +; SIZE-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: %reverse_nxv2i1 = call @llvm.vector.reverse.nxv2i1( undef) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %reverse_nxv16i8 = call @llvm.vector.reverse.nxv16i8( undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll index 8f3219861f2fd..d97d70e99ccbf 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll @@ -11,10 +11,10 @@ define void @reverse() { ; ; CHECK-LABEL: 'reverse' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> ; CHECK-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> @@ -22,31 +22,31 @@ define void @reverse() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SIZE-LABEL: 'reverse' -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = 
shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> @@ -54,24 +54,24 @@ define void @reverse() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = 
shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = shufflevector 
<2 x double> undef, <2 x double> undef, <2 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll b/llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll new file mode 100644 index 0000000000000..5fbf38b2560d4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64-linux | FileCheck %s + +define i8 @andnot_add_with_neg_i8(i8 %a0, i8 %a1) { +; CHECK-LABEL: andnot_add_with_neg_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub w8, w0, w1 +; CHECK-NEXT: bic w0, w0, w8 +; CHECK-NEXT: ret + %not = xor i8 %a0, -1 + %sum = add i8 %not, %a1 + %and = and i8 %sum, %a0 + ret i8 %and +} + +define i8 @andnot_sub_with_neg_i8(i8 %a0, i8 %a1) { +; CHECK-LABEL: andnot_sub_with_neg_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: bic w0, w0, w8 +; CHECK-NEXT: ret + %not = xor i8 %a0, -1 + %diff = sub i8 %not, %a1 + %and = and i8 %diff, %a0 + ret i8 %and +} + +define i16 @andnot_add_with_neg_i16(i16 %a0, i16 %a1) { +; CHECK-LABEL: andnot_add_with_neg_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub w8, w0, w1 +; CHECK-NEXT: bic w0, w0, w8 +; CHECK-NEXT: ret + %not = xor i16 %a0, -1 + %sum = add i16 %not, %a1 + %and = and i16 %sum, %a0 + ret i16 %and +} + +define i16 @andnot_sub_with_neg_i16(i16 %a0, i16 %a1) { +; CHECK-LABEL: andnot_sub_with_neg_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: bic w0, w0, w8 +; CHECK-NEXT: ret + %not = xor i16 
%a0, -1 + %diff = sub i16 %not, %a1 + %and = and i16 %diff, %a0 + ret i16 %and +} + +define i32 @andnot_add_with_neg_i32(i32 %a0, i32 %a1) { +; CHECK-LABEL: andnot_add_with_neg_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub w8, w0, w1 +; CHECK-NEXT: bic w0, w0, w8 +; CHECK-NEXT: ret + %not = xor i32 %a0, -1 + %sum = add i32 %not, %a1 + %and = and i32 %sum, %a0 + ret i32 %and +} + +define i32 @andnot_sub_with_neg_i32(i32 %a0, i32 %a1) { +; CHECK-LABEL: andnot_sub_with_neg_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: bic w0, w0, w8 +; CHECK-NEXT: ret + %not = xor i32 %a0, -1 + %diff = sub i32 %not, %a1 + %and = and i32 %diff, %a0 + ret i32 %and +} + +define i64 @andnot_add_with_neg_i64(i64 %a0, i64 %a1) { +; CHECK-LABEL: andnot_add_with_neg_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub x8, x0, x1 +; CHECK-NEXT: bic x0, x0, x8 +; CHECK-NEXT: ret + %not = xor i64 %a0, -1 + %sum = add i64 %not, %a1 + %and = and i64 %sum, %a0 + ret i64 %and +} + +define i64 @andnot_sub_with_neg_i64(i64 %a0, i64 %a1) { +; CHECK-LABEL: andnot_sub_with_neg_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: bic x0, x0, x8 +; CHECK-NEXT: ret + %not = xor i64 %a0, -1 + %diff = sub i64 %not, %a1 + %and = and i64 %diff, %a0 + ret i64 %and +} diff --git a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll index 92b95a90d89a0..38416310b3536 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll @@ -28,10 +28,9 @@ define i128 @f4(i128 %x, i128 %y) { ret i128 %r } -; FIXME: This is wrong; should be "#__aarch64_cas1_relax" define i8 @f5(i8 %expected, i8 %new, ptr %ptr) "target-features"="+outline-atomics" { ; CHECK-LABEL: "#f5": -; CHECK: bl __aarch64_cas1_relax +; CHECK: bl "#__aarch64_cas1_relax" %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1 %r = extractvalue { i8, i1 } %pair, 0 ret i8 %r @@ -43,3 +42,33 @@ define float @f6(float 
%val, i32 %a) { %call = tail call fast float @llvm.ldexp.f32(float %val, i32 %a) ret float %call } + +@dst = global [512 x i8] zeroinitializer, align 1 +@src = global [512 x i8] zeroinitializer, align 1 + +; FIXME: Wrong and probably needs a # prefix +define void @call__arm_sc_memcpy(i64 noundef %n) #0 { +; CHECK-LABEL: "#call__arm_sc_memcpy": +; CHECK: bl __arm_sc_memcpy + + tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false) + ret void +} + +; FIXME: Wrong and probably needs a # prefix +define void @call__arm_sc_memmove(i64 noundef %n) #0 { +; CHECK-LABEL: "#call__arm_sc_memmove": +; CHECK: bl __arm_sc_memmove + tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false) + ret void +} + +; FIXME: Wrong and probably needs a # prefix +define void @call__arm_sc_memset(i64 noundef %n) #0 { +; CHECK-LABEL: "#call__arm_sc_memset": +; CHECK: bl __arm_sc_memset + tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false) + ret void +} + +attributes #0 = { nounwind "aarch64_pstate_sm_enabled" "target-features"="+sme2" } diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index 0c7b3c7d3c138..0ea80a075fae9 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -1,15 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM -; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT -; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM -; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm 
-aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NEWLOWERING-I8MM +; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NODOT +; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-DOT +; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-DOT-I8MM define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { -; CHECK-DOT-LABEL: udot: -; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: udot v0.4s, v2.16b, v1.16b -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: udot: ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b @@ -19,6 +13,16 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h ; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: udot: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: udot v0.4s, v2.16b, v1.16b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: udot: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: udot v0.4s, v2.16b, v1.16b +; CHECK-DOT-I8MM-NEXT: ret %u.wide = zext <16 x i8> %u to <16 x i32> %s.wide = zext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide @@ -27,22 +31,6 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { } define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){ -; CHECK-DOT-LABEL: udot_in_loop: -; CHECK-DOT: // %bb.0: // %entry -; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 -; CHECK-DOT-NEXT: mov x8, xzr -; CHECK-DOT-NEXT: .LBB1_1: // %vector.body -; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-DOT-NEXT: ldr q2, [x0, x8] -; CHECK-DOT-NEXT: ldr q3, [x1, x8] -; CHECK-DOT-NEXT: mov v0.16b, v1.16b -; CHECK-DOT-NEXT: add x8, x8, #16 -; CHECK-DOT-NEXT: udot v1.4s, v2.16b, v3.16b -; CHECK-DOT-NEXT: cmp x8, #16 -; CHECK-DOT-NEXT: b.ne .LBB1_1 -; 
CHECK-DOT-NEXT: // %bb.2: // %end -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: udot_in_loop: ; CHECK-NODOT: // %bb.0: // %entry ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 @@ -63,6 +51,38 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){ ; CHECK-NODOT-NEXT: b.ne .LBB1_1 ; CHECK-NODOT-NEXT: // %bb.2: // %end ; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: udot_in_loop: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-NEXT: mov x8, xzr +; CHECK-DOT-NEXT: .LBB1_1: // %vector.body +; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-NEXT: ldr q2, [x0, x8] +; CHECK-DOT-NEXT: ldr q3, [x1, x8] +; CHECK-DOT-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-NEXT: add x8, x8, #16 +; CHECK-DOT-NEXT: udot v1.4s, v2.16b, v3.16b +; CHECK-DOT-NEXT: cmp x8, #16 +; CHECK-DOT-NEXT: b.ne .LBB1_1 +; CHECK-DOT-NEXT: // %bb.2: // %end +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: udot_in_loop: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: mov x8, xzr +; CHECK-DOT-I8MM-NEXT: .LBB1_1: // %vector.body +; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] +; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] +; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 +; CHECK-DOT-I8MM-NEXT: udot v1.4s, v2.16b, v3.16b +; CHECK-DOT-I8MM-NEXT: cmp x8, #16 +; CHECK-DOT-I8MM-NEXT: b.ne .LBB1_1 +; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end +; CHECK-DOT-I8MM-NEXT: ret entry: br label %vector.body @@ -86,11 +106,6 @@ end: } define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { -; CHECK-DOT-LABEL: udot_narrow: -; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: udot v0.2s, v2.8b, v1.8b -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: udot_narrow: ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b @@ -105,6 +120,16 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> 
%s) { ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: udot_narrow: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: udot v0.2s, v2.8b, v1.8b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: udot_narrow: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: udot v0.2s, v2.8b, v1.8b +; CHECK-DOT-I8MM-NEXT: ret %u.wide = zext <8 x i8> %u to <8 x i32> %s.wide = zext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide @@ -113,11 +138,6 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { } define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { -; CHECK-DOT-LABEL: sdot: -; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: sdot: ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b @@ -127,6 +147,16 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h ; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: sdot: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: sdot: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: sdot v0.4s, v2.16b, v1.16b +; CHECK-DOT-I8MM-NEXT: ret %u.wide = sext <16 x i8> %u to <16 x i32> %s.wide = sext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide @@ -135,11 +165,6 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { } define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { -; CHECK-DOT-LABEL: sdot_narrow: -; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: sdot v0.2s, v2.8b, v1.8b -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: sdot_narrow: ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b @@ -154,6 +179,16 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> 
%s) { ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: sdot_narrow: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: sdot v0.2s, v2.8b, v1.8b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: sdot_narrow: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: sdot v0.2s, v2.8b, v1.8b +; CHECK-DOT-I8MM-NEXT: ret %u.wide = sext <8 x i8> %u to <8 x i32> %s.wide = sext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide @@ -162,27 +197,34 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { } define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { -; CHECK-NOI8MM-LABEL: usdot: -; CHECK-NOI8MM: // %bb.0: -; CHECK-NOI8MM-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h -; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v4.8h, v3.8h -; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h -; CHECK-NOI8MM-NEXT: ret +; CHECK-NODOT-LABEL: usdot: +; CHECK-NODOT: // %bb.0: +; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h +; CHECK-NODOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h +; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h +; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: usdot: -; CHECK-I8MM: // %bb.0: -; CHECK-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: usdot: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-DOT-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-DOT-NEXT: smlal v0.4s, v4.4h, v3.4h +; 
CHECK-DOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h +; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-DOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: usdot: -; CHECK-NEWLOWERING-I8MM: // %bb.0: -; CHECK-NEWLOWERING-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: usdot: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: ret %u.wide = zext <16 x i8> %u to <16 x i32> %s.wide = sext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide @@ -191,60 +233,67 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { } define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){ -; CHECK-NOI8MM-LABEL: usdot_in_loop: -; CHECK-NOI8MM: // %bb.0: // %entry -; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NOI8MM-NEXT: mov x8, xzr -; CHECK-NOI8MM-NEXT: .LBB6_1: // %vector.body -; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8] -; CHECK-NOI8MM-NEXT: ldr q3, [x1, x8] -; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b -; CHECK-NOI8MM-NEXT: add x8, x8, #16 -; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0 -; CHECK-NOI8MM-NEXT: cmp x8, #16 -; CHECK-NOI8MM-NEXT: smlal v1.4s, v4.4h, v5.4h -; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v5.8h -; CHECK-NOI8MM-NEXT: smlal v1.4s, v2.4h, v3.4h -; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v2.8h, v3.8h -; CHECK-NOI8MM-NEXT: b.ne .LBB6_1 -; CHECK-NOI8MM-NEXT: // %bb.2: // %end -; CHECK-NOI8MM-NEXT: ret +; CHECK-NODOT-LABEL: usdot_in_loop: +; CHECK-NODOT: // %bb.0: // %entry +; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NODOT-NEXT: mov x8, xzr +; CHECK-NODOT-NEXT: .LBB6_1: // %vector.body +; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NODOT-NEXT: ldr q2, [x0, x8] +; 
CHECK-NODOT-NEXT: ldr q3, [x1, x8] +; CHECK-NODOT-NEXT: mov v0.16b, v1.16b +; CHECK-NODOT-NEXT: add x8, x8, #16 +; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: ushll v5.8h, v3.8b, #0 +; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-NODOT-NEXT: cmp x8, #16 +; CHECK-NODOT-NEXT: smlal v1.4s, v4.4h, v5.4h +; CHECK-NODOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h +; CHECK-NODOT-NEXT: smlal v1.4s, v2.4h, v3.4h +; CHECK-NODOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h +; CHECK-NODOT-NEXT: b.ne .LBB6_1 +; CHECK-NODOT-NEXT: // %bb.2: // %end +; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: usdot_in_loop: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000 -; CHECK-I8MM-NEXT: mov x8, xzr -; CHECK-I8MM-NEXT: .LBB6_1: // %vector.body -; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-I8MM-NEXT: ldr q2, [x0, x8] -; CHECK-I8MM-NEXT: ldr q3, [x1, x8] -; CHECK-I8MM-NEXT: mov v0.16b, v1.16b -; CHECK-I8MM-NEXT: add x8, x8, #16 -; CHECK-I8MM-NEXT: usdot v1.4s, v3.16b, v2.16b -; CHECK-I8MM-NEXT: cmp x8, #16 -; CHECK-I8MM-NEXT: b.ne .LBB6_1 -; CHECK-I8MM-NEXT: // %bb.2: // %end -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: usdot_in_loop: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-NEXT: mov x8, xzr +; CHECK-DOT-NEXT: .LBB6_1: // %vector.body +; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-NEXT: ldr q2, [x0, x8] +; CHECK-DOT-NEXT: ldr q3, [x1, x8] +; CHECK-DOT-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-NEXT: add x8, x8, #16 +; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-DOT-NEXT: ushll v5.8h, v3.8b, #0 +; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-DOT-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-DOT-NEXT: cmp x8, #16 +; CHECK-DOT-NEXT: smlal v1.4s, v4.4h, v5.4h +; CHECK-DOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h +; CHECK-DOT-NEXT: smlal v1.4s, v2.4h, v3.4h +; CHECK-DOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h +; CHECK-DOT-NEXT: b.ne 
.LBB6_1 +; CHECK-DOT-NEXT: // %bb.2: // %end +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: usdot_in_loop: -; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry -; CHECK-NEWLOWERING-I8MM-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: mov x8, xzr -; CHECK-NEWLOWERING-I8MM-NEXT: .LBB6_1: // %vector.body -; CHECK-NEWLOWERING-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q2, [x0, x8] -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q3, [x1, x8] -; CHECK-NEWLOWERING-I8MM-NEXT: mov v0.16b, v1.16b -; CHECK-NEWLOWERING-I8MM-NEXT: add x8, x8, #16 -; CHECK-NEWLOWERING-I8MM-NEXT: usdot v1.4s, v3.16b, v2.16b -; CHECK-NEWLOWERING-I8MM-NEXT: cmp x8, #16 -; CHECK-NEWLOWERING-I8MM-NEXT: b.ne .LBB6_1 -; CHECK-NEWLOWERING-I8MM-NEXT: // %bb.2: // %end -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: usdot_in_loop: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: mov x8, xzr +; CHECK-DOT-I8MM-NEXT: .LBB6_1: // %vector.body +; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] +; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] +; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 +; CHECK-DOT-I8MM-NEXT: usdot v1.4s, v3.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: cmp x8, #16 +; CHECK-DOT-I8MM-NEXT: b.ne .LBB6_1 +; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end +; CHECK-DOT-I8MM-NEXT: ret entry: br label %vector.body @@ -268,32 +317,44 @@ end: } define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ -; CHECK-NOI8MM-LABEL: usdot_narrow: -; CHECK-NOI8MM: // %bb.0: -; CHECK-NOI8MM-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NOI8MM-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NOI8MM-NEXT: smull v3.4s, v2.4h, v1.4h -; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-NOI8MM-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-NOI8MM-NEXT: ext v5.16b, 
v2.16b, v2.16b, #8 -; CHECK-NOI8MM-NEXT: smull2 v1.4s, v2.8h, v1.8h -; CHECK-NOI8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-NOI8MM-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NOI8MM-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v4.4h -; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s -; CHECK-NOI8MM-NEXT: ret +; CHECK-NODOT-LABEL: usdot_narrow: +; CHECK-NODOT: // %bb.0: +; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-NODOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: smull2 v1.4s, v2.8h, v1.8h +; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s +; CHECK-NODOT-NEXT: smlal v0.4s, v5.4h, v4.4h +; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: usdot_narrow: -; CHECK-I8MM: // %bb.0: -; CHECK-I8MM-NEXT: usdot v0.2s, v1.8b, v2.8b -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: usdot_narrow: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-DOT-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-DOT-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-DOT-NEXT: smull v3.4s, v2.4h, v1.4h +; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-DOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-DOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-DOT-NEXT: smull2 v1.4s, v2.8h, v1.8h +; CHECK-DOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-DOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-DOT-NEXT: add v0.2s, v3.2s, v0.2s +; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v4.4h +; CHECK-DOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: usdot_narrow: -; CHECK-NEWLOWERING-I8MM: // %bb.0: -; CHECK-NEWLOWERING-I8MM-NEXT: 
usdot v0.2s, v1.8b, v2.8b -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: usdot_narrow: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: usdot v0.2s, v1.8b, v2.8b +; CHECK-DOT-I8MM-NEXT: ret %u.wide = zext <8 x i8> %u to <8 x i32> %s.wide = sext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide @@ -302,27 +363,34 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ } define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{ -; CHECK-NOI8MM-LABEL: sudot: -; CHECK-NOI8MM: // %bb.0: -; CHECK-NOI8MM-NEXT: sshll v3.8h, v1.8b, #0 -; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h -; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v4.8h, v3.8h -; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h -; CHECK-NOI8MM-NEXT: ret +; CHECK-NODOT-LABEL: sudot: +; CHECK-NODOT: // %bb.0: +; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h +; CHECK-NODOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h +; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h +; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: sudot: -; CHECK-I8MM: // %bb.0: -; CHECK-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: sudot: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: sshll v3.8h, v1.8b, #0 +; CHECK-DOT-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-DOT-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-DOT-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-DOT-NEXT: smlal v0.4s, v4.4h, v3.4h +; CHECK-DOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h +; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-DOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h +; CHECK-DOT-NEXT: ret ; -; 
CHECK-NEWLOWERING-I8MM-LABEL: sudot: -; CHECK-NEWLOWERING-I8MM: // %bb.0: -; CHECK-NEWLOWERING-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: sudot: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b +; CHECK-DOT-I8MM-NEXT: ret %s.wide = sext <16 x i8> %u to <16 x i32> %u.wide = zext <16 x i8> %s to <16 x i32> %mult = mul nuw nsw <16 x i32> %u.wide, %s.wide @@ -331,60 +399,67 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{ } define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){ -; CHECK-NOI8MM-LABEL: sudot_in_loop: -; CHECK-NOI8MM: // %bb.0: // %entry -; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NOI8MM-NEXT: mov x8, xzr -; CHECK-NOI8MM-NEXT: .LBB9_1: // %vector.body -; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8] -; CHECK-NOI8MM-NEXT: ldr q3, [x1, x8] -; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b -; CHECK-NOI8MM-NEXT: add x8, x8, #16 -; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0 -; CHECK-NOI8MM-NEXT: cmp x8, #16 -; CHECK-NOI8MM-NEXT: smlal v1.4s, v4.4h, v5.4h -; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v5.8h -; CHECK-NOI8MM-NEXT: smlal v1.4s, v2.4h, v3.4h -; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v2.8h, v3.8h -; CHECK-NOI8MM-NEXT: b.ne .LBB9_1 -; CHECK-NOI8MM-NEXT: // %bb.2: // %end -; CHECK-NOI8MM-NEXT: ret +; CHECK-NODOT-LABEL: sudot_in_loop: +; CHECK-NODOT: // %bb.0: // %entry +; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NODOT-NEXT: mov x8, xzr +; CHECK-NODOT-NEXT: .LBB9_1: // %vector.body +; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NODOT-NEXT: ldr q2, [x0, x8] +; CHECK-NODOT-NEXT: ldr q3, [x1, x8] +; CHECK-NODOT-NEXT: mov v0.16b, v1.16b +; CHECK-NODOT-NEXT: add x8, x8, #16 +; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0 +; 
CHECK-NODOT-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NODOT-NEXT: cmp x8, #16 +; CHECK-NODOT-NEXT: smlal v1.4s, v4.4h, v5.4h +; CHECK-NODOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h +; CHECK-NODOT-NEXT: smlal v1.4s, v2.4h, v3.4h +; CHECK-NODOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h +; CHECK-NODOT-NEXT: b.ne .LBB9_1 +; CHECK-NODOT-NEXT: // %bb.2: // %end +; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: sudot_in_loop: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000 -; CHECK-I8MM-NEXT: mov x8, xzr -; CHECK-I8MM-NEXT: .LBB9_1: // %vector.body -; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-I8MM-NEXT: ldr q2, [x0, x8] -; CHECK-I8MM-NEXT: ldr q3, [x1, x8] -; CHECK-I8MM-NEXT: mov v0.16b, v1.16b -; CHECK-I8MM-NEXT: add x8, x8, #16 -; CHECK-I8MM-NEXT: usdot v1.4s, v2.16b, v3.16b -; CHECK-I8MM-NEXT: cmp x8, #16 -; CHECK-I8MM-NEXT: b.ne .LBB9_1 -; CHECK-I8MM-NEXT: // %bb.2: // %end -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: sudot_in_loop: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-NEXT: mov x8, xzr +; CHECK-DOT-NEXT: .LBB9_1: // %vector.body +; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-NEXT: ldr q2, [x0, x8] +; CHECK-DOT-NEXT: ldr q3, [x1, x8] +; CHECK-DOT-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-NEXT: add x8, x8, #16 +; CHECK-DOT-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-DOT-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-DOT-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-DOT-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-DOT-NEXT: cmp x8, #16 +; CHECK-DOT-NEXT: smlal v1.4s, v4.4h, v5.4h +; CHECK-DOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h +; CHECK-DOT-NEXT: smlal v1.4s, v2.4h, v3.4h +; CHECK-DOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h +; CHECK-DOT-NEXT: b.ne .LBB9_1 +; CHECK-DOT-NEXT: // %bb.2: // %end +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: sudot_in_loop: -; CHECK-NEWLOWERING-I8MM: // %bb.0: // 
%entry -; CHECK-NEWLOWERING-I8MM-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: mov x8, xzr -; CHECK-NEWLOWERING-I8MM-NEXT: .LBB9_1: // %vector.body -; CHECK-NEWLOWERING-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q2, [x0, x8] -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q3, [x1, x8] -; CHECK-NEWLOWERING-I8MM-NEXT: mov v0.16b, v1.16b -; CHECK-NEWLOWERING-I8MM-NEXT: add x8, x8, #16 -; CHECK-NEWLOWERING-I8MM-NEXT: usdot v1.4s, v2.16b, v3.16b -; CHECK-NEWLOWERING-I8MM-NEXT: cmp x8, #16 -; CHECK-NEWLOWERING-I8MM-NEXT: b.ne .LBB9_1 -; CHECK-NEWLOWERING-I8MM-NEXT: // %bb.2: // %end -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: sudot_in_loop: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: mov x8, xzr +; CHECK-DOT-I8MM-NEXT: .LBB9_1: // %vector.body +; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] +; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] +; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 +; CHECK-DOT-I8MM-NEXT: usdot v1.4s, v2.16b, v3.16b +; CHECK-DOT-I8MM-NEXT: cmp x8, #16 +; CHECK-DOT-I8MM-NEXT: b.ne .LBB9_1 +; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end +; CHECK-DOT-I8MM-NEXT: ret entry: br label %vector.body @@ -408,32 +483,44 @@ end: } define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ -; CHECK-NOI8MM-LABEL: sudot_narrow: -; CHECK-NOI8MM: // %bb.0: -; CHECK-NOI8MM-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NOI8MM-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NOI8MM-NEXT: smull v3.4s, v2.4h, v1.4h -; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h -; CHECK-NOI8MM-NEXT: ext v4.16b, v1.16b, v1.16b, #8 -; CHECK-NOI8MM-NEXT: ext v5.16b, v2.16b, v2.16b, #8 -; CHECK-NOI8MM-NEXT: smull2 v1.4s, v2.8h, v1.8h -; CHECK-NOI8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-NOI8MM-NEXT: ext v1.16b, 
v1.16b, v1.16b, #8 -; CHECK-NOI8MM-NEXT: add v0.2s, v3.2s, v0.2s -; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v4.4h -; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s -; CHECK-NOI8MM-NEXT: ret +; CHECK-NODOT-LABEL: sudot_narrow: +; CHECK-NODOT: // %bb.0: +; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-NODOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-NODOT-NEXT: smull2 v1.4s, v2.8h, v1.8h +; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s +; CHECK-NODOT-NEXT: smlal v0.4s, v5.4h, v4.4h +; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: sudot_narrow: -; CHECK-I8MM: // %bb.0: -; CHECK-I8MM-NEXT: usdot v0.2s, v2.8b, v1.8b -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: sudot_narrow: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-DOT-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-DOT-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-DOT-NEXT: smull v3.4s, v2.4h, v1.4h +; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h +; CHECK-DOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 +; CHECK-DOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 +; CHECK-DOT-NEXT: smull2 v1.4s, v2.8h, v1.8h +; CHECK-DOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-DOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-DOT-NEXT: add v0.2s, v3.2s, v0.2s +; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v4.4h +; CHECK-DOT-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: sudot_narrow: -; CHECK-NEWLOWERING-I8MM: // %bb.0: -; CHECK-NEWLOWERING-I8MM-NEXT: usdot v0.2s, v2.8b, v1.8b -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: sudot_narrow: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: 
usdot v0.2s, v2.8b, v1.8b +; CHECK-DOT-I8MM-NEXT: ret %u.wide = sext <8 x i8> %u to <8 x i32> %s.wide = zext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide @@ -460,21 +547,21 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s ; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: udot_8to64: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b -; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s -; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: udot_8to64: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v3.16b +; CHECK-DOT-NEXT: uaddw v0.2d, v0.2d, v4.2s +; CHECK-DOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: udot_8to64: -; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry -; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b -; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v0.2d, v0.2d, v4.2s -; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: udot_8to64: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b +; CHECK-DOT-I8MM-NEXT: uaddw v0.2d, v0.2d, v4.2s +; CHECK-DOT-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s +; CHECK-DOT-I8MM-NEXT: ret entry: %a.wide = zext <16 x i8> %a to <16 x i64> %b.wide = zext <16 x i8> %b to <16 x i64> @@ -503,21 +590,21 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s ; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: sdot_8to64: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-I8MM-NEXT: sdot v4.4s, 
v2.16b, v3.16b -; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s -; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: sdot_8to64: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: sdot v4.4s, v2.16b, v3.16b +; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-DOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: sdot_8to64: -; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry -; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b -; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: sdot_8to64: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b +; CHECK-DOT-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-DOT-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-DOT-I8MM-NEXT: ret entry: %a.wide = sext <16 x i8> %a to <16 x i64> %b.wide = sext <16 x i8> %b to <16 x i64> @@ -528,45 +615,61 @@ entry: } define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ -; CHECK-NOI8MM-LABEL: usdot_8to64: -; CHECK-NOI8MM: // %bb.0: // %entry -; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0 -; CHECK-NOI8MM-NEXT: ushll v6.4s, v4.4h, #0 -; CHECK-NOI8MM-NEXT: ushll v7.4s, v2.4h, #0 -; CHECK-NOI8MM-NEXT: sshll v16.4s, v5.4h, #0 -; CHECK-NOI8MM-NEXT: sshll v17.4s, v3.4h, #0 -; CHECK-NOI8MM-NEXT: ushll2 v4.4s, v4.8h, #0 -; CHECK-NOI8MM-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NOI8MM-NEXT: sshll2 v5.4s, v5.8h, #0 -; CHECK-NOI8MM-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s -; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, 
v17.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s -; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s -; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s -; CHECK-NOI8MM-NEXT: ret +; CHECK-NODOT-LABEL: usdot_8to64: +; CHECK-NODOT: // %bb.0: // %entry +; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0 +; CHECK-NODOT-NEXT: ushll v7.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: sshll v16.4s, v5.4h, #0 +; CHECK-NODOT-NEXT: sshll v17.4s, v3.4h, #0 +; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0 +; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0 +; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-NODOT-NEXT: smlal v0.2d, v6.2s, v16.2s +; CHECK-NODOT-NEXT: smlal v1.2d, v7.2s, v17.2s +; CHECK-NODOT-NEXT: smlal2 v0.2d, v6.4s, v16.4s +; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v17.4s +; CHECK-NODOT-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-NODOT-NEXT: smlal v1.2d, v2.2s, v3.2s +; CHECK-NODOT-NEXT: smlal2 v0.2d, v4.4s, v5.4s +; CHECK-NODOT-NEXT: smlal2 v1.2d, v2.4s, v3.4s +; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: usdot_8to64: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-I8MM-NEXT: usdot v4.4s, v2.16b, v3.16b -; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s -; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: usdot_8to64: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-DOT-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-DOT-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-DOT-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-DOT-NEXT: ushll v6.4s, v4.4h, #0 +; CHECK-DOT-NEXT: ushll v7.4s, v2.4h, #0 +; CHECK-DOT-NEXT: sshll v16.4s, v5.4h, #0 +; 
CHECK-DOT-NEXT: sshll v17.4s, v3.4h, #0 +; CHECK-DOT-NEXT: ushll2 v4.4s, v4.8h, #0 +; CHECK-DOT-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-DOT-NEXT: sshll2 v5.4s, v5.8h, #0 +; CHECK-DOT-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-DOT-NEXT: smlal v0.2d, v6.2s, v16.2s +; CHECK-DOT-NEXT: smlal v1.2d, v7.2s, v17.2s +; CHECK-DOT-NEXT: smlal2 v0.2d, v6.4s, v16.4s +; CHECK-DOT-NEXT: smlal2 v1.2d, v7.4s, v17.4s +; CHECK-DOT-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-DOT-NEXT: smlal v1.2d, v2.2s, v3.2s +; CHECK-DOT-NEXT: smlal2 v0.2d, v4.4s, v5.4s +; CHECK-DOT-NEXT: smlal2 v1.2d, v2.4s, v3.4s +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: usdot_8to64: -; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry -; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: usdot v4.4s, v2.16b, v3.16b -; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: usdot_8to64: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: usdot v4.4s, v2.16b, v3.16b +; CHECK-DOT-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-DOT-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-DOT-I8MM-NEXT: ret entry: %a.wide = zext <16 x i8> %a to <16 x i64> %b.wide = sext <16 x i8> %b to <16 x i64> @@ -577,45 +680,61 @@ entry: } define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { -; CHECK-NOI8MM-LABEL: sudot_8to64: -; CHECK-NOI8MM: // %bb.0: // %entry -; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0 -; CHECK-NOI8MM-NEXT: sshll v6.4s, v4.4h, #0 -; CHECK-NOI8MM-NEXT: sshll v7.4s, v2.4h, #0 -; CHECK-NOI8MM-NEXT: ushll v16.4s, v5.4h, #0 -; CHECK-NOI8MM-NEXT: ushll v17.4s, v3.4h, #0 -; CHECK-NOI8MM-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-NOI8MM-NEXT: sshll2 
v2.4s, v2.8h, #0 -; CHECK-NOI8MM-NEXT: ushll2 v5.4s, v5.8h, #0 -; CHECK-NOI8MM-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s -; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, v17.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s -; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s -; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s -; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s -; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s -; CHECK-NOI8MM-NEXT: ret +; CHECK-NODOT-LABEL: sudot_8to64: +; CHECK-NODOT: // %bb.0: // %entry +; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: ushll v5.8h, v3.8b, #0 +; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0 +; CHECK-NODOT-NEXT: sshll v7.4s, v2.4h, #0 +; CHECK-NODOT-NEXT: ushll v16.4s, v5.4h, #0 +; CHECK-NODOT-NEXT: ushll v17.4s, v3.4h, #0 +; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NODOT-NEXT: smlal v0.2d, v6.2s, v16.2s +; CHECK-NODOT-NEXT: smlal v1.2d, v7.2s, v17.2s +; CHECK-NODOT-NEXT: smlal2 v0.2d, v6.4s, v16.4s +; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v17.4s +; CHECK-NODOT-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-NODOT-NEXT: smlal v1.2d, v2.2s, v3.2s +; CHECK-NODOT-NEXT: smlal2 v0.2d, v4.4s, v5.4s +; CHECK-NODOT-NEXT: smlal2 v1.2d, v2.4s, v3.4s +; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: sudot_8to64: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-I8MM-NEXT: usdot v4.4s, v3.16b, v2.16b -; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s -; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: sudot_8to64: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0 +; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; 
CHECK-DOT-NEXT: ushll v5.8h, v3.8b, #0 +; CHECK-DOT-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-DOT-NEXT: sshll v6.4s, v4.4h, #0 +; CHECK-DOT-NEXT: sshll v7.4s, v2.4h, #0 +; CHECK-DOT-NEXT: ushll v16.4s, v5.4h, #0 +; CHECK-DOT-NEXT: ushll v17.4s, v3.4h, #0 +; CHECK-DOT-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-DOT-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-DOT-NEXT: ushll2 v5.4s, v5.8h, #0 +; CHECK-DOT-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-DOT-NEXT: smlal v0.2d, v6.2s, v16.2s +; CHECK-DOT-NEXT: smlal v1.2d, v7.2s, v17.2s +; CHECK-DOT-NEXT: smlal2 v0.2d, v6.4s, v16.4s +; CHECK-DOT-NEXT: smlal2 v1.2d, v7.4s, v17.4s +; CHECK-DOT-NEXT: smlal v0.2d, v4.2s, v5.2s +; CHECK-DOT-NEXT: smlal v1.2d, v2.2s, v3.2s +; CHECK-DOT-NEXT: smlal2 v0.2d, v4.4s, v5.4s +; CHECK-DOT-NEXT: smlal2 v1.2d, v2.4s, v3.4s +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: sudot_8to64: -; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry -; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: usdot v4.4s, v3.16b, v2.16b -; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: sudot_8to64: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: usdot v4.4s, v3.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-DOT-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-DOT-I8MM-NEXT: ret entry: %a.wide = sext <16 x i8> %a to <16 x i64> %b.wide = zext <16 x i8> %b to <16 x i64> @@ -626,12 +745,6 @@ entry: } define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ -; CHECK-DOT-LABEL: udot_no_bin_op: -; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: movi v2.16b, #1 -; CHECK-DOT-NEXT: udot v0.4s, v1.16b, v2.16b -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: udot_no_bin_op: ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: ushll v2.8h, v1.8b, #0 @@ -641,77 +754,53 @@ define <4 x i32> 
@udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h ; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: udot_no_bin_op: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: movi v2.16b, #1 +; CHECK-DOT-NEXT: udot v0.4s, v1.16b, v2.16b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: udot_no_bin_op: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: movi v2.16b, #1 +; CHECK-DOT-I8MM-NEXT: udot v0.4s, v1.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: ret %a.wide = zext <16 x i8> %a to <16 x i32> %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide) ret <4 x i32> %partial.reduce } define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){ -; CHECK-NODOT-LABEL: udot_no_bin_op_in_loop: -; CHECK-NODOT: // %bb.0: // %entry -; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NODOT-NEXT: mov x8, xzr -; CHECK-NODOT-NEXT: .LBB16_1: // %vector.body -; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NODOT-NEXT: ldr q2, [x0, x8] -; CHECK-NODOT-NEXT: mov v0.16b, v1.16b -; CHECK-NODOT-NEXT: add x8, x8, #16 -; CHECK-NODOT-NEXT: cmp x8, #16 -; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0 -; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v3.4h -; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h -; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v2.4h -; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h -; CHECK-NODOT-NEXT: b.ne .LBB16_1 -; CHECK-NODOT-NEXT: // %bb.2: // %end -; CHECK-NODOT-NEXT: ret -; -; CHECK-I8MM-LABEL: udot_no_bin_op_in_loop: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000 -; CHECK-I8MM-NEXT: movi v2.16b, #1 -; CHECK-I8MM-NEXT: mov x8, xzr -; CHECK-I8MM-NEXT: .LBB16_1: // %vector.body -; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-I8MM-NEXT: ldr q3, [x0, x8] -; CHECK-I8MM-NEXT: mov v0.16b, v1.16b -; CHECK-I8MM-NEXT: add x8, x8, #16 -; 
CHECK-I8MM-NEXT: cmp x8, #16 -; CHECK-I8MM-NEXT: udot v1.4s, v3.16b, v2.16b -; CHECK-I8MM-NEXT: b.ne .LBB16_1 -; CHECK-I8MM-NEXT: // %bb.2: // %end -; CHECK-I8MM-NEXT: ret -; -; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_in_loop: -; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry -; CHECK-NEWLOWERING-I8MM-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEWLOWERING-I8MM-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: adrp x9, .LCPI16_2 -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] -; CHECK-NEWLOWERING-I8MM-NEXT: adrp x8, .LCPI16_1 -; CHECK-NEWLOWERING-I8MM-NEXT: adrp x10, .LCPI16_3 -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q3, [x8, :lo12:.LCPI16_1] -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q4, [x9, :lo12:.LCPI16_2] -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q5, [x10, :lo12:.LCPI16_3] -; CHECK-NEWLOWERING-I8MM-NEXT: mov x8, xzr -; CHECK-NEWLOWERING-I8MM-NEXT: .LBB16_1: // %vector.body -; CHECK-NEWLOWERING-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q6, [x0, x8] -; CHECK-NEWLOWERING-I8MM-NEXT: mov v0.16b, v2.16b -; CHECK-NEWLOWERING-I8MM-NEXT: add x8, x8, #16 -; CHECK-NEWLOWERING-I8MM-NEXT: cmp x8, #16 -; CHECK-NEWLOWERING-I8MM-NEXT: tbl v7.16b, { v6.16b }, v3.16b -; CHECK-NEWLOWERING-I8MM-NEXT: tbl v16.16b, { v6.16b }, v4.16b -; CHECK-NEWLOWERING-I8MM-NEXT: tbl v17.16b, { v6.16b }, v5.16b -; CHECK-NEWLOWERING-I8MM-NEXT: tbl v6.16b, { v6.16b }, v1.16b -; CHECK-NEWLOWERING-I8MM-NEXT: add v2.4s, v2.4s, v17.4s -; CHECK-NEWLOWERING-I8MM-NEXT: add v7.4s, v16.4s, v7.4s -; CHECK-NEWLOWERING-I8MM-NEXT: add v2.4s, v2.4s, v7.4s -; CHECK-NEWLOWERING-I8MM-NEXT: add v2.4s, v2.4s, v6.4s -; CHECK-NEWLOWERING-I8MM-NEXT: b.ne .LBB16_1 -; CHECK-NEWLOWERING-I8MM-NEXT: // %bb.2: // %end -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-COMMON-LABEL: udot_no_bin_op_in_loop: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: adrp x8, .LCPI16_0 +; CHECK-COMMON-NEXT: movi v2.2d, #0000000000000000 +; CHECK-COMMON-NEXT: adrp x9, .LCPI16_2 +; 
CHECK-COMMON-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-COMMON-NEXT: adrp x8, .LCPI16_1 +; CHECK-COMMON-NEXT: adrp x10, .LCPI16_3 +; CHECK-COMMON-NEXT: ldr q3, [x8, :lo12:.LCPI16_1] +; CHECK-COMMON-NEXT: ldr q4, [x9, :lo12:.LCPI16_2] +; CHECK-COMMON-NEXT: ldr q5, [x10, :lo12:.LCPI16_3] +; CHECK-COMMON-NEXT: mov x8, xzr +; CHECK-COMMON-NEXT: .LBB16_1: // %vector.body +; CHECK-COMMON-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-COMMON-NEXT: ldr q6, [x0, x8] +; CHECK-COMMON-NEXT: mov v0.16b, v2.16b +; CHECK-COMMON-NEXT: add x8, x8, #16 +; CHECK-COMMON-NEXT: cmp x8, #16 +; CHECK-COMMON-NEXT: tbl v7.16b, { v6.16b }, v3.16b +; CHECK-COMMON-NEXT: tbl v16.16b, { v6.16b }, v4.16b +; CHECK-COMMON-NEXT: tbl v17.16b, { v6.16b }, v5.16b +; CHECK-COMMON-NEXT: tbl v6.16b, { v6.16b }, v1.16b +; CHECK-COMMON-NEXT: add v2.4s, v2.4s, v17.4s +; CHECK-COMMON-NEXT: add v7.4s, v16.4s, v7.4s +; CHECK-COMMON-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-COMMON-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-COMMON-NEXT: b.ne .LBB16_1 +; CHECK-COMMON-NEXT: // %bb.2: // %end +; CHECK-COMMON-NEXT: ret entry: br label %vector.body @@ -731,12 +820,6 @@ end: } define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ -; CHECK-DOT-LABEL: sdot_no_bin_op: -; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: movi v2.16b, #1 -; CHECK-DOT-NEXT: sdot v0.4s, v1.16b, v2.16b -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: sdot_no_bin_op: ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: sshll v2.8h, v1.8b, #0 @@ -746,18 +829,24 @@ define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h ; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: sdot_no_bin_op: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: movi v2.16b, #1 +; CHECK-DOT-NEXT: sdot v0.4s, v1.16b, v2.16b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: movi v2.16b, #1 +; CHECK-DOT-I8MM-NEXT: sdot v0.4s, 
v1.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: ret %a.wide = sext <16 x i8> %a to <16 x i32> %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide) ret <4 x i32> %partial.reduce } define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ -; CHECK-DOT-LABEL: udot_no_bin_op_narrow: -; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: movi v2.8b, #1 -; CHECK-DOT-NEXT: udot v0.2s, v1.8b, v2.8b -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: udot_no_bin_op_narrow: ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0 @@ -772,18 +861,24 @@ define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: udot_no_bin_op_narrow: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: movi v2.8b, #1 +; CHECK-DOT-NEXT: udot v0.2s, v1.8b, v2.8b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_narrow: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: movi v2.8b, #1 +; CHECK-DOT-I8MM-NEXT: udot v0.2s, v1.8b, v2.8b +; CHECK-DOT-I8MM-NEXT: ret %a.wide = zext <8 x i8> %a to <8 x i32> %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide) ret <2 x i32> %partial.reduce } define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ -; CHECK-DOT-LABEL: sdot_no_bin_op_narrow: -; CHECK-DOT: // %bb.0: -; CHECK-DOT-NEXT: movi v2.8b, #1 -; CHECK-DOT-NEXT: sdot v0.2s, v1.8b, v2.8b -; CHECK-DOT-NEXT: ret -; ; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow: ; CHECK-NODOT: // %bb.0: ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0 @@ -798,6 +893,18 @@ define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s ; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: sdot_no_bin_op_narrow: +; CHECK-DOT: // 
%bb.0: +; CHECK-DOT-NEXT: movi v2.8b, #1 +; CHECK-DOT-NEXT: sdot v0.2s, v1.8b, v2.8b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op_narrow: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: movi v2.8b, #1 +; CHECK-DOT-I8MM-NEXT: sdot v0.2s, v1.8b, v2.8b +; CHECK-DOT-I8MM-NEXT: ret %a.wide = sext <8 x i8> %a to <8 x i32> %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide) ret <2 x i32> %partial.reduce @@ -822,23 +929,23 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s ; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: udot_no_bin_op_8to64: -; CHECK-I8MM: // %bb.0: -; CHECK-I8MM-NEXT: movi v3.16b, #1 -; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b -; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s -; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: udot_no_bin_op_8to64: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: movi v3.16b, #1 +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v3.16b +; CHECK-DOT-NEXT: uaddw v0.2d, v0.2d, v4.2s +; CHECK-DOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_8to64: -; CHECK-NEWLOWERING-I8MM: // %bb.0: -; CHECK-NEWLOWERING-I8MM-NEXT: movi v3.16b, #1 -; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b -; CHECK-NEWLOWERING-I8MM-NEXT: uaddw v0.2d, v0.2d, v4.2s -; CHECK-NEWLOWERING-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_8to64: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: movi v3.16b, #1 +; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b +; CHECK-DOT-I8MM-NEXT: uaddw v0.2d, v0.2d, v4.2s +; 
CHECK-DOT-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s +; CHECK-DOT-I8MM-NEXT: ret %a.wide = zext <16 x i8> %a to <16 x i64> %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) ret <4 x i64> %partial.reduce @@ -863,35 +970,35 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s ; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: sdot_no_bin_op_8to64: -; CHECK-I8MM: // %bb.0: -; CHECK-I8MM-NEXT: movi v3.16b, #1 -; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b -; CHECK-I8MM-NEXT: saddw2 v1.2d, v1.2d, v4.4s -; CHECK-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: sdot_no_bin_op_8to64: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: movi v3.16b, #1 +; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-NEXT: sdot v4.4s, v2.16b, v3.16b +; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-DOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: sdot_no_bin_op_8to64: -; CHECK-NEWLOWERING-I8MM: // %bb.0: -; CHECK-NEWLOWERING-I8MM-NEXT: movi v3.16b, #1 -; CHECK-NEWLOWERING-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b -; CHECK-NEWLOWERING-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s -; CHECK-NEWLOWERING-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op_8to64: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: movi v3.16b, #1 +; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b +; CHECK-DOT-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s +; CHECK-DOT-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s +; CHECK-DOT-I8MM-NEXT: ret %a.wide = sext <16 x i8> %a to <16 x i64> %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x 
i64> %a.wide) ret <4 x i64> %partial.reduce } define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ -; CHECK-LABEL: not_udot: -; CHECK: // %bb.0: -; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: not_udot: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: umull v1.8h, v2.8b, v1.8b +; CHECK-COMMON-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-COMMON-NEXT: uaddw2 v0.4s, v0.4s, v1.8h +; CHECK-COMMON-NEXT: ret %u.wide = zext <8 x i8> %u to <8 x i32> %s.wide = zext <8 x i8> %s to <8 x i32> %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide @@ -900,16 +1007,16 @@ define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ } define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) { -; CHECK-LABEL: not_udot_narrow: -; CHECK: // %bb.0: -; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: bic v2.4h, #255, lsl #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umull v3.4s, v2.4h, v1.4h -; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h -; CHECK-NEXT: ext v1.16b, v3.16b, v3.16b, #8 -; CHECK-NEXT: add v0.2s, v1.2s, v0.2s -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: not_udot_narrow: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: bic v1.4h, #255, lsl #8 +; CHECK-COMMON-NEXT: bic v2.4h, #255, lsl #8 +; CHECK-COMMON-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-COMMON-NEXT: umull v3.4s, v2.4h, v1.4h +; CHECK-COMMON-NEXT: umlal v0.4s, v2.4h, v1.4h +; CHECK-COMMON-NEXT: ext v1.16b, v3.16b, v3.16b, #8 +; CHECK-COMMON-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-COMMON-NEXT: ret %u.wide = zext <4 x i8> %u to <4 x i32> %s.wide = zext <4 x i8> %s to <4 x i32> %mult = mul nuw nsw <4 x i32> %s.wide, %u.wide @@ -918,18 +1025,18 @@ define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) { } define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){ -; CHECK-LABEL: udot_different_types: 
-; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: umlal v0.2d, v3.2s, v4.2s -; CHECK-NEXT: umlal2 v0.2d, v3.4s, v4.4s -; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s -; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: udot_different_types: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-COMMON-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-COMMON-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-COMMON-NEXT: ushll v4.4s, v2.4h, #0 +; CHECK-COMMON-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-COMMON-NEXT: umlal v0.2d, v3.2s, v4.2s +; CHECK-COMMON-NEXT: umlal2 v0.2d, v3.4s, v4.4s +; CHECK-COMMON-NEXT: umlal v0.2d, v1.2s, v2.2s +; CHECK-COMMON-NEXT: umlal2 v0.2d, v1.4s, v2.4s +; CHECK-COMMON-NEXT: ret entry: %a.wide = zext <8 x i16> %a to <8 x i64> %b.wide = zext <8 x i8> %b to <8 x i64> @@ -939,18 +1046,18 @@ entry: } define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){ -; CHECK-LABEL: sdot_different_types: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: sshll v3.4s, v1.4h, #0 -; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: sshll v4.4s, v2.4h, #0 -; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s -; CHECK-NEXT: smlal2 v0.2d, v3.4s, v4.4s -; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s -; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: sdot_different_types: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-COMMON-NEXT: sshll v3.4s, v1.4h, #0 +; CHECK-COMMON-NEXT: sshll2 v1.4s, v1.8h, #0 +; CHECK-COMMON-NEXT: sshll v4.4s, v2.4h, #0 +; CHECK-COMMON-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-COMMON-NEXT: smlal v0.2d, v3.2s, v4.2s +; CHECK-COMMON-NEXT: smlal2 v0.2d, v3.4s, v4.4s +; 
CHECK-COMMON-NEXT: smlal v0.2d, v1.2s, v2.2s +; CHECK-COMMON-NEXT: smlal2 v0.2d, v1.4s, v2.4s +; CHECK-COMMON-NEXT: ret entry: %a.wide = sext <8 x i16> %a to <8 x i64> %b.wide = sext <8 x i8> %b to <8 x i64> @@ -960,18 +1067,18 @@ entry: } define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){ -; CHECK-LABEL: usdot_different_types: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: ushll v3.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: sshll v4.4s, v2.4h, #0 -; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s -; CHECK-NEXT: smlal2 v0.2d, v3.4s, v4.4s -; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s -; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: usdot_different_types: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-COMMON-NEXT: ushll v3.4s, v1.4h, #0 +; CHECK-COMMON-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-COMMON-NEXT: sshll v4.4s, v2.4h, #0 +; CHECK-COMMON-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-COMMON-NEXT: smlal v0.2d, v3.2s, v4.2s +; CHECK-COMMON-NEXT: smlal2 v0.2d, v3.4s, v4.4s +; CHECK-COMMON-NEXT: smlal v0.2d, v1.2s, v2.2s +; CHECK-COMMON-NEXT: smlal2 v0.2d, v1.4s, v2.4s +; CHECK-COMMON-NEXT: ret entry: %a.wide = zext <8 x i16> %a to <8 x i64> %b.wide = sext <8 x i8> %b to <8 x i64> @@ -981,18 +1088,18 @@ entry: } define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){ -; CHECK-LABEL: sudot_different_types: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: sshll v3.4s, v1.4h, #0 -; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v4.4s, v2.4h, #0 -; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 -; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s -; CHECK-NEXT: smlal2 v0.2d, v3.4s, v4.4s -; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s -; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: sudot_different_types: +; 
CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-COMMON-NEXT: sshll v3.4s, v1.4h, #0 +; CHECK-COMMON-NEXT: sshll2 v1.4s, v1.8h, #0 +; CHECK-COMMON-NEXT: ushll v4.4s, v2.4h, #0 +; CHECK-COMMON-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-COMMON-NEXT: smlal v0.2d, v3.2s, v4.2s +; CHECK-COMMON-NEXT: smlal2 v0.2d, v3.4s, v4.4s +; CHECK-COMMON-NEXT: smlal v0.2d, v1.2s, v2.2s +; CHECK-COMMON-NEXT: smlal2 v0.2d, v1.4s, v2.4s +; CHECK-COMMON-NEXT: ret entry: %a.wide = sext <8 x i16> %a to <8 x i64> %b.wide = zext <8 x i8> %b to <8 x i64> @@ -1002,74 +1109,86 @@ entry: } define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) { -; CHECK-NOI8MM-LABEL: usdot_multiple_zext_users: -; CHECK-NOI8MM: // %bb.0: // %entry -; CHECK-NOI8MM-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NOI8MM-NEXT: mov x8, xzr -; CHECK-NOI8MM-NEXT: .LBB28_1: // %vector.body -; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8] -; CHECK-NOI8MM-NEXT: ldr q3, [x1, x8] -; CHECK-NOI8MM-NEXT: ldr q4, [x2, x8] -; CHECK-NOI8MM-NEXT: add x8, x8, #16 -; CHECK-NOI8MM-NEXT: sshll v5.8h, v2.8b, #0 -; CHECK-NOI8MM-NEXT: ushll v6.8h, v4.8b, #0 -; CHECK-NOI8MM-NEXT: sshll v7.8h, v3.8b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-NOI8MM-NEXT: ushll2 v4.8h, v4.16b, #0 -; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0 -; CHECK-NOI8MM-NEXT: cmp x8, #1024 -; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v6.4h -; CHECK-NOI8MM-NEXT: smlal v1.4s, v7.4h, v6.4h -; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v5.8h, v6.8h -; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v7.8h, v6.8h -; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v4.4h -; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v4.4h -; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v4.8h -; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v3.8h, v4.8h -; CHECK-NOI8MM-NEXT: b.ne .LBB28_1 -; CHECK-NOI8MM-NEXT: // %bb.2: // %end -; CHECK-NOI8MM-NEXT: add v0.4s, v1.4s, v0.4s -; 
CHECK-NOI8MM-NEXT: ret +; CHECK-NODOT-LABEL: usdot_multiple_zext_users: +; CHECK-NODOT: // %bb.0: // %entry +; CHECK-NODOT-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NODOT-NEXT: mov x8, xzr +; CHECK-NODOT-NEXT: .LBB28_1: // %vector.body +; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NODOT-NEXT: ldr q2, [x0, x8] +; CHECK-NODOT-NEXT: ldr q3, [x1, x8] +; CHECK-NODOT-NEXT: ldr q4, [x2, x8] +; CHECK-NODOT-NEXT: add x8, x8, #16 +; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0 +; CHECK-NODOT-NEXT: ushll v6.8h, v4.8b, #0 +; CHECK-NODOT-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NODOT-NEXT: ushll2 v4.8h, v4.16b, #0 +; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NODOT-NEXT: cmp x8, #1024 +; CHECK-NODOT-NEXT: smlal v0.4s, v5.4h, v6.4h +; CHECK-NODOT-NEXT: smlal v1.4s, v7.4h, v6.4h +; CHECK-NODOT-NEXT: smlal2 v0.4s, v5.8h, v6.8h +; CHECK-NODOT-NEXT: smlal2 v1.4s, v7.8h, v6.8h +; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v4.4h +; CHECK-NODOT-NEXT: smlal v1.4s, v3.4h, v4.4h +; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v4.8h +; CHECK-NODOT-NEXT: smlal2 v1.4s, v3.8h, v4.8h +; CHECK-NODOT-NEXT: b.ne .LBB28_1 +; CHECK-NODOT-NEXT: // %bb.2: // %end +; CHECK-NODOT-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NODOT-NEXT: ret ; -; CHECK-I8MM-LABEL: usdot_multiple_zext_users: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: movi v0.2d, #0000000000000000 -; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000 -; CHECK-I8MM-NEXT: mov x8, xzr -; CHECK-I8MM-NEXT: .LBB28_1: // %vector.body -; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-I8MM-NEXT: ldr q2, [x0, x8] -; CHECK-I8MM-NEXT: ldr q3, [x1, x8] -; CHECK-I8MM-NEXT: ldr q4, [x2, x8] -; CHECK-I8MM-NEXT: add x8, x8, #16 -; CHECK-I8MM-NEXT: usdot v0.4s, v4.16b, v2.16b -; CHECK-I8MM-NEXT: usdot v1.4s, v4.16b, v3.16b -; CHECK-I8MM-NEXT: cmp x8, #1024 -; CHECK-I8MM-NEXT: b.ne .LBB28_1 -; CHECK-I8MM-NEXT: // %bb.2: // %end -; 
CHECK-I8MM-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-I8MM-NEXT: ret +; CHECK-DOT-LABEL: usdot_multiple_zext_users: +; CHECK-DOT: // %bb.0: // %entry +; CHECK-DOT-NEXT: movi v0.2d, #0000000000000000 +; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-NEXT: mov x8, xzr +; CHECK-DOT-NEXT: .LBB28_1: // %vector.body +; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-NEXT: ldr q2, [x0, x8] +; CHECK-DOT-NEXT: ldr q3, [x1, x8] +; CHECK-DOT-NEXT: ldr q4, [x2, x8] +; CHECK-DOT-NEXT: add x8, x8, #16 +; CHECK-DOT-NEXT: sshll v5.8h, v2.8b, #0 +; CHECK-DOT-NEXT: ushll v6.8h, v4.8b, #0 +; CHECK-DOT-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-DOT-NEXT: ushll2 v4.8h, v4.16b, #0 +; CHECK-DOT-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-DOT-NEXT: cmp x8, #1024 +; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v6.4h +; CHECK-DOT-NEXT: smlal v1.4s, v7.4h, v6.4h +; CHECK-DOT-NEXT: smlal2 v0.4s, v5.8h, v6.8h +; CHECK-DOT-NEXT: smlal2 v1.4s, v7.8h, v6.8h +; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v4.4h +; CHECK-DOT-NEXT: smlal v1.4s, v3.4h, v4.4h +; CHECK-DOT-NEXT: smlal2 v0.4s, v2.8h, v4.8h +; CHECK-DOT-NEXT: smlal2 v1.4s, v3.8h, v4.8h +; CHECK-DOT-NEXT: b.ne .LBB28_1 +; CHECK-DOT-NEXT: // %bb.2: // %end +; CHECK-DOT-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-DOT-NEXT: ret ; -; CHECK-NEWLOWERING-I8MM-LABEL: usdot_multiple_zext_users: -; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry -; CHECK-NEWLOWERING-I8MM-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEWLOWERING-I8MM-NEXT: mov x8, xzr -; CHECK-NEWLOWERING-I8MM-NEXT: .LBB28_1: // %vector.body -; CHECK-NEWLOWERING-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q2, [x0, x8] -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q3, [x1, x8] -; CHECK-NEWLOWERING-I8MM-NEXT: ldr q4, [x2, x8] -; CHECK-NEWLOWERING-I8MM-NEXT: add x8, x8, #16 -; CHECK-NEWLOWERING-I8MM-NEXT: usdot v0.4s, v4.16b, v2.16b -; 
CHECK-NEWLOWERING-I8MM-NEXT: usdot v1.4s, v4.16b, v3.16b -; CHECK-NEWLOWERING-I8MM-NEXT: cmp x8, #1024 -; CHECK-NEWLOWERING-I8MM-NEXT: b.ne .LBB28_1 -; CHECK-NEWLOWERING-I8MM-NEXT: // %bb.2: // %end -; CHECK-NEWLOWERING-I8MM-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEWLOWERING-I8MM-NEXT: ret +; CHECK-DOT-I8MM-LABEL: usdot_multiple_zext_users: +; CHECK-DOT-I8MM: // %bb.0: // %entry +; CHECK-DOT-I8MM-NEXT: movi v0.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 +; CHECK-DOT-I8MM-NEXT: mov x8, xzr +; CHECK-DOT-I8MM-NEXT: .LBB28_1: // %vector.body +; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] +; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] +; CHECK-DOT-I8MM-NEXT: ldr q4, [x2, x8] +; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 +; CHECK-DOT-I8MM-NEXT: usdot v0.4s, v4.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: usdot v1.4s, v4.16b, v3.16b +; CHECK-DOT-I8MM-NEXT: cmp x8, #1024 +; CHECK-DOT-I8MM-NEXT: b.ne .LBB28_1 +; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end +; CHECK-DOT-I8MM-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-DOT-I8MM-NEXT: ret entry: br label %vector.body @@ -1100,15 +1219,15 @@ end: } define <2 x i64> @udot_16to64(<2 x i64> %acc, <8 x i16> %input){ -; CHECK-LABEL: udot_16to64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s -; CHECK-NEXT: uaddw2 v0.2d, v0.2d, v2.4s -; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s -; CHECK-NEXT: uaddw2 v0.2d, v0.2d, v1.4s -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: udot_16to64: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: ushll v2.4s, v1.4h, #0 +; CHECK-COMMON-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-COMMON-NEXT: uaddw v0.2d, v0.2d, v2.2s +; CHECK-COMMON-NEXT: uaddw2 v0.2d, v0.2d, v2.4s +; CHECK-COMMON-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-COMMON-NEXT: uaddw2 v0.2d, v0.2d, v1.4s +; CHECK-COMMON-NEXT: ret entry: %input.wide = zext <8 x i16> %input to <8 x i64> %partial.reduce 
= tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %input.wide) diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll new file mode 100644 index 0000000000000..1b541d1330aae --- /dev/null +++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll @@ -0,0 +1,579 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; RUN: opt -mattr=+sve -vector-library=LIBMVEC -replace-with-veclib -S < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +;. +; CHECK: @llvm.compiler.used = appending global [34 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxvv_pow, ptr @_ZGVsMxvv_powf, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxv_tan, ptr @_ZGVsMxv_tanf, ptr @_ZGVsMxv_acos, ptr @_ZGVsMxv_acosf, ptr @_ZGVsMxv_asin, ptr @_ZGVsMxv_asinf, ptr @_ZGVsMxv_atan, ptr @_ZGVsMxv_atanf, ptr @_ZGVsMxvv_atan2, ptr @_ZGVsMxvv_atan2f, ptr @_ZGVsMxv_cosh, ptr @_ZGVsMxv_coshf, ptr @_ZGVsMxv_sinh, ptr @_ZGVsMxv_sinhf, ptr @_ZGVsMxv_tanh, ptr @_ZGVsMxv_tanhf], section "llvm.metadata" +;. 
+define @llvm_ceil_vscale_f64( %in) { +; CHECK-LABEL: @llvm_ceil_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.ceil.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.ceil.nxv2f64( %in) + ret %1 +} + +define @llvm_ceil_vscale_f32( %in) { +; CHECK-LABEL: @llvm_ceil_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.ceil.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.ceil.nxv4f32( %in) + ret %1 +} + +define @llvm_copysign_vscale_f64( %mag, %sgn) { +; CHECK-LABEL: @llvm_copysign_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.copysign.nxv2f64( [[MAG:%.*]], [[SGN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.copysign.nxv2f64( %mag, %sgn) + ret %1 +} + +define @llvm_copysign_vscale_f32( %mag, %sgn) { +; CHECK-LABEL: @llvm_copysign_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.copysign.nxv4f32( [[MAG:%.*]], [[SGN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.copysign.nxv4f32( %mag, %sgn) + ret %1 +} + +define @llvm_cos_vscale_f64( %in) { +; CHECK-LABEL: @llvm_cos_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cos( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.cos.nxv2f64( %in) + ret %1 +} + +define @llvm_cos_vscale_f32( %in) { +; CHECK-LABEL: @llvm_cos_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cosf( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.cos.nxv4f32( %in) + ret %1 +} + +define @llvm_exp_vscale_f64( %in) { +; CHECK-LABEL: @llvm_exp_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.exp.nxv2f64( %in) + ret %1 +} + +define @llvm_exp_vscale_f32( %in) { +; CHECK-LABEL: @llvm_exp_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_expf( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.exp.nxv4f32( %in) 
+ ret %1 +} + +define @llvm_exp10_vscale_f64( %in) { +; CHECK-LABEL: @llvm_exp10_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.exp10.nxv2f64( %in) + ret %1 +} + +define @llvm_exp10_vscale_f32( %in) { +; CHECK-LABEL: @llvm_exp10_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10f( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.exp10.nxv4f32( %in) + ret %1 +} + +define @llvm_exp2_vscale_f64( %in) { +; CHECK-LABEL: @llvm_exp2_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.exp2.nxv2f64( %in) + ret %1 +} + +define @llvm_exp2_vscale_f32( %in) { +; CHECK-LABEL: @llvm_exp2_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2f( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.exp2.nxv4f32( %in) + ret %1 +} + +define @llvm_fabs_vscale_f64( %in) { +; CHECK-LABEL: @llvm_fabs_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.fabs.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.fabs.nxv2f64( %in) + ret %1 +} + +define @llvm_fabs_vscale_f32( %in) { +; CHECK-LABEL: @llvm_fabs_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.fabs.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.fabs.nxv4f32( %in) + ret %1 +} + +define @llvm_floor_vscale_f64( %in) { +; CHECK-LABEL: @llvm_floor_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.floor.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.floor.nxv2f64( %in) + ret %1 +} + +define @llvm_floor_vscale_f32( %in) { +; CHECK-LABEL: @llvm_floor_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.floor.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.floor.nxv4f32( %in) + ret %1 +} + +define 
@llvm_fma_vscale_f64( %a, %b, %c ) { +; CHECK-LABEL: @llvm_fma_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.fma.nxv2f64( [[A:%.*]], [[B:%.*]], [[C:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.fma.nxv2f64( %a, %b, %c) + ret %1 +} + +define @llvm_fma_vscale_f32( %a, %b, %c) { +; CHECK-LABEL: @llvm_fma_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.fma.nxv4f32( [[A:%.*]], [[B:%.*]], [[C:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.fma.nxv4f32( %a, %b, %c) + ret %1 +} + +define @llvm_log_vscale_f64( %in) { +; CHECK-LABEL: @llvm_log_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.log.nxv2f64( %in) + ret %1 +} + +define @llvm_log_vscale_f32( %in) { +; CHECK-LABEL: @llvm_log_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_logf( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.log.nxv4f32( %in) + ret %1 +} + +define @llvm_log10_vscale_f64( %in) { +; CHECK-LABEL: @llvm_log10_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.log10.nxv2f64( %in) + ret %1 +} + +define @llvm_log10_vscale_f32( %in) { +; CHECK-LABEL: @llvm_log10_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10f( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.log10.nxv4f32( %in) + ret %1 +} + +define @llvm_log2_vscale_f64( %in) { +; CHECK-LABEL: @llvm_log2_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.log2.nxv2f64( %in) + ret %1 +} + +define @llvm_log2_vscale_f32( %in) { +; CHECK-LABEL: @llvm_log2_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2f( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast 
@llvm.log2.nxv4f32( %in) + ret %1 +} + +define @llvm_maxnum_vscale_f64( %in0, %in1) { +; CHECK-LABEL: @llvm_maxnum_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.maxnum.nxv2f64( [[IN0:%.*]], [[IN1:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.maxnum.nxv2f64( %in0, %in1) + ret %1 +} + +define @llvm_maxnum_vscale_f32( %in0, %in1) { +; CHECK-LABEL: @llvm_maxnum_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.maxnum.nxv4f32( [[IN0:%.*]], [[IN1:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.maxnum.nxv4f32( %in0, %in1) + ret %1 +} + +define @llvm_minnum_vscale_f64( %in0, %in1) { +; CHECK-LABEL: @llvm_minnum_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.minnum.nxv2f64( [[IN0:%.*]], [[IN1:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.minnum.nxv2f64( %in0, %in1) + ret %1 +} + +define @llvm_minnum_vscale_f32( %in0, %in1) { +; CHECK-LABEL: @llvm_minnum_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.minnum.nxv4f32( [[IN0:%.*]], [[IN1:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.minnum.nxv4f32( %in0, %in1) + ret %1 +} + +define @llvm_nearbyint_vscale_f64( %in) { +; CHECK-LABEL: @llvm_nearbyint_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.nearbyint.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.nearbyint.nxv2f64( %in) + ret %1 +} + +define @llvm_nearbyint_vscale_f32( %in) { +; CHECK-LABEL: @llvm_nearbyint_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.nearbyint.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.nearbyint.nxv4f32( %in) + ret %1 +} + +define @llvm_pow_vscale_f64( %in, %pow) { +; CHECK-LABEL: @llvm_pow_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxvv_pow( [[IN:%.*]], [[POW:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.pow.nxv2f64( %in, %pow) + ret %1 +} + +define @llvm_pow_vscale_f32( %in, %pow) { +; CHECK-LABEL: @llvm_pow_vscale_f32( +; 
CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxvv_powf( [[IN:%.*]], [[POW:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.pow.nxv4f32( %in, %pow) + ret %1 +} + +define @llvm_rint_vscale_f64( %in) { +; CHECK-LABEL: @llvm_rint_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.rint.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.rint.nxv2f64( %in) + ret %1 +} + +define @llvm_rint_vscale_f32( %in) { +; CHECK-LABEL: @llvm_rint_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.rint.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.rint.nxv4f32( %in) + ret %1 +} + +define @llvm_round_vscale_f64( %in) { +; CHECK-LABEL: @llvm_round_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.round.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.round.nxv2f64( %in) + ret %1 +} + +define @llvm_round_vscale_f32( %in) { +; CHECK-LABEL: @llvm_round_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.round.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.round.nxv4f32( %in) + ret %1 +} + +define @llvm_sin_vscale_f64( %in) { +; CHECK-LABEL: @llvm_sin_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sin( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.sin.nxv2f64( %in) + ret %1 +} + +define @llvm_sin_vscale_f32( %in) { +; CHECK-LABEL: @llvm_sin_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sinf( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.sin.nxv4f32( %in) + ret %1 +} + +define @llvm_sqrt_vscale_f64( %in) { +; CHECK-LABEL: @llvm_sqrt_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.sqrt.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.sqrt.nxv2f64( %in) + ret %1 +} + +define @llvm_sqrt_vscale_f32( %in) { +; CHECK-LABEL: @llvm_sqrt_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast 
@llvm.sqrt.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.sqrt.nxv4f32( %in) + ret %1 +} + +define @llvm_tan_vscale_f64( %in) { +; CHECK-LABEL: @llvm_tan_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_tan( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.tan.nxv2f64( %in) + ret %1 +} + +define @llvm_tan_vscale_f32( %in) { +; CHECK-LABEL: @llvm_tan_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_tanf( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.tan.nxv4f32( %in) + ret %1 +} + +define @llvm_acos_vscale_f64( %in) { +; CHECK-LABEL: @llvm_acos_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_acos( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.acos.nxv2f64( %in) + ret %1 +} + +define @llvm_acos_vscale_f32( %in) { +; CHECK-LABEL: @llvm_acos_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_acosf( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.acos.nxv4f32( %in) + ret %1 +} + +define @llvm_asin_vscale_f64( %in) { +; CHECK-LABEL: @llvm_asin_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_asin( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.asin.nxv2f64( %in) + ret %1 +} + +define @llvm_asin_vscale_f32( %in) { +; CHECK-LABEL: @llvm_asin_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_asinf( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.asin.nxv4f32( %in) + ret %1 +} + +define @llvm_atan_vscale_f64( %in) { +; CHECK-LABEL: @llvm_atan_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_atan( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.atan.nxv2f64( %in) + ret %1 +} + +define @llvm_atan_vscale_f32( %in) { +; CHECK-LABEL: @llvm_atan_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_atanf( 
[[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.atan.nxv4f32( %in) + ret %1 +} + +define @llvm_atan2_vscale_f64( %x, %y) { +; CHECK-LABEL: @llvm_atan2_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxvv_atan2( [[X:%.*]], [[Y:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.atan2.nxv2f64( %x, %y) + ret %1 +} + +define @llvm_atan2_vscale_f32( %x, %y) { +; CHECK-LABEL: @llvm_atan2_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxvv_atan2f( [[X:%.*]], [[Y:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.atan2.nxv4f32( %x, %y) + ret %1 +} + +define @llvm_cosh_vscale_f64( %in) { +; CHECK-LABEL: @llvm_cosh_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cosh( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.cosh.nxv2f64( %in) + ret %1 +} + +define @llvm_cosh_vscale_f32( %in) { +; CHECK-LABEL: @llvm_cosh_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_coshf( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.cosh.nxv4f32( %in) + ret %1 +} + +define @llvm_sinh_vscale_f64( %in) { +; CHECK-LABEL: @llvm_sinh_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sinh( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.sinh.nxv2f64( %in) + ret %1 +} + +define @llvm_sinh_vscale_f32( %in) { +; CHECK-LABEL: @llvm_sinh_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sinhf( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.sinh.nxv4f32( %in) + ret %1 +} + +define @llvm_tanh_vscale_f64( %in) { +; CHECK-LABEL: @llvm_tanh_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_tanh( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.tanh.nxv2f64( %in) + ret %1 +} + +define @llvm_tanh_vscale_f32( %in) { +; CHECK-LABEL: @llvm_tanh_vscale_f32( +; CHECK-NEXT: 
[[TMP1:%.*]] = call fast @_ZGVsMxv_tanhf( [[IN:%.*]], splat (i1 true)) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.tanh.nxv4f32( %in) + ret %1 +} + + +define @llvm_trunc_vscale_f64( %in) { +; CHECK-LABEL: @llvm_trunc_vscale_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.trunc.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.trunc.nxv2f64( %in) + ret %1 +} + +define @llvm_trunc_vscale_f32( %in) { +; CHECK-LABEL: @llvm_trunc_vscale_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.trunc.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call fast @llvm.trunc.nxv4f32( %in) + ret %1 +} + +declare @llvm.ceil.nxv2f64() +declare @llvm.ceil.nxv4f32() +declare @llvm.copysign.nxv2f64(, ) +declare @llvm.copysign.nxv4f32(, ) +declare @llvm.cos.nxv2f64() +declare @llvm.cos.nxv4f32() +declare @llvm.exp.nxv2f64() +declare @llvm.exp.nxv4f32() +declare @llvm.exp2.nxv2f64() +declare @llvm.exp2.nxv4f32() +declare @llvm.exp10.nxv2f64() +declare @llvm.exp10.nxv4f32() +declare @llvm.fabs.nxv2f64() +declare @llvm.fabs.nxv4f32() +declare @llvm.floor.nxv2f64() +declare @llvm.floor.nxv4f32() +declare @llvm.fma.nxv2f64(, , ) +declare @llvm.fma.nxv4f32(, , ) +declare @llvm.log.nxv2f64() +declare @llvm.log.nxv4f32() +declare @llvm.log10.nxv2f64() +declare @llvm.log10.nxv4f32() +declare @llvm.log2.nxv2f64() +declare @llvm.log2.nxv4f32() +declare @llvm.maxnum.nxv2f64(, ) +declare @llvm.maxnum.nxv4f32(, ) +declare @llvm.minnum.nxv2f64(, ) +declare @llvm.minnum.nxv4f32(, ) +declare @llvm.nearbyint.nxv2f64() +declare @llvm.nearbyint.nxv4f32() +declare @llvm.pow.nxv2f64(, ) +declare @llvm.pow.nxv4f32(, ) +declare @llvm.rint.nxv2f64() +declare @llvm.rint.nxv4f32() +declare @llvm.round.nxv2f64() +declare @llvm.round.nxv4f32() +declare @llvm.sin.nxv2f64() +declare @llvm.sin.nxv4f32() +declare @llvm.sqrt.nxv2f64() +declare @llvm.sqrt.nxv4f32() +declare @llvm.tan.nxv2f64() +declare @llvm.tan.nxv4f32() +declare @llvm.trunc.nxv2f64() +declare 
@llvm.trunc.nxv4f32() +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { "target-features"="+sve" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+sve" } +;. diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll new file mode 100644 index 0000000000000..6323d942a08e7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll @@ -0,0 +1,577 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; RUN: opt -vector-library=LIBMVEC -replace-with-veclib -S < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +;. +; CHECK: @llvm.compiler.used = appending global [34 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2vv_pow, ptr @_ZGVnN4vv_powf, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2v_tan, ptr @_ZGVnN4v_tanf, ptr @_ZGVnN2v_acos, ptr @_ZGVnN4v_acosf, ptr @_ZGVnN2v_asin, ptr @_ZGVnN4v_asinf, ptr @_ZGVnN2v_atan, ptr @_ZGVnN4v_atanf, ptr @_ZGVnN2vv_atan2, ptr @_ZGVnN4vv_atan2f, ptr @_ZGVnN2v_cosh, ptr @_ZGVnN4v_coshf, ptr @_ZGVnN2v_sinh, ptr @_ZGVnN4v_sinhf, ptr @_ZGVnN2v_tanh, ptr @_ZGVnN4v_tanhf], section "llvm.metadata" +;. 
+define <2 x double> @llvm_ceil_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_ceil_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.ceil.v2f64(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.ceil.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_ceil_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_ceil_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_copysign_f64(<2 x double> %mag, <2 x double> %sgn) { +; CHECK-LABEL: @llvm_copysign_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> [[MAG:%.*]], <2 x double> [[SGN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sgn) + ret <2 x double> %1 +} + +define <4 x float> @llvm_copysign_f32(<4 x float> %mag, <4 x float> %sgn) { +; CHECK-LABEL: @llvm_copysign_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.copysign.v4f32(<4 x float> [[MAG:%.*]], <4 x float> [[SGN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sgn) + ret <4 x float> %1 +} + +define <2 x double> @llvm_cos_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_cos_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_cos(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.cos.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_cos_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_cos_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_cosf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> 
@llvm.cos.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_exp_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_exp_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.exp.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_exp_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_exp_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_expf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.exp.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_exp10_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_exp10_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp10(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.exp10.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_exp10_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_exp10_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.exp10.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_exp2_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_exp2_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp2(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.exp2.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_exp2_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_exp2_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.exp2.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_fabs_f64(<2 x double> %in) { +; 
CHECK-LABEL: @llvm_fabs_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.fabs.v2f64(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.fabs.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_fabs_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_fabs_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_floor_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_floor_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.floor.v2f64(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_floor_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_floor_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_fma_f64(<2 x double> %a, <2 x double> %b, <2 x double> %c ) { +; CHECK-LABEL: @llvm_fma_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) + ret <2 x double> %1 +} + +define <4 x float> @llvm_fma_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: @llvm_fma_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x 
float> %b, <4 x float> %c) + ret <4 x float> %1 +} + +define <2 x double> @llvm_log_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_log_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_log(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.log.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_log_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_log_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_logf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.log.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_log10_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_log10_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_log10(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.log10.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_log10_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_log10_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_log10f(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.log10.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_log2_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_log2_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_log2(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.log2.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_log2_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_log2_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_log2f(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.log2.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_maxnum_f64(<2 x double> %in0, <2 x 
double> %in1) { +; CHECK-LABEL: @llvm_maxnum_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.maxnum.v2f64(<2 x double> [[IN0:%.*]], <2 x double> [[IN1:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.maxnum.v2f64(<2 x double> %in0, <2 x double> %in1) + ret <2 x double> %1 +} + +define <4 x float> @llvm_maxnum_f32(<4 x float> %in0, <4 x float> %in1) { +; CHECK-LABEL: @llvm_maxnum_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> [[IN0:%.*]], <4 x float> [[IN1:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %in0, <4 x float> %in1) + ret <4 x float> %1 +} + +define <2 x double> @llvm_minnum_f64(<2 x double> %in0, <2 x double> %in1) { +; CHECK-LABEL: @llvm_minnum_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.minnum.v2f64(<2 x double> [[IN0:%.*]], <2 x double> [[IN1:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.minnum.v2f64(<2 x double> %in0, <2 x double> %in1) + ret <2 x double> %1 +} + +define <4 x float> @llvm_minnum_f32(<4 x float> %in0, <4 x float> %in1) { +; CHECK-LABEL: @llvm_minnum_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.minnum.v4f32(<4 x float> [[IN0:%.*]], <4 x float> [[IN1:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %in0, <4 x float> %in1) + ret <4 x float> %1 +} + +define <2 x double> @llvm_nearbyint_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_nearbyint_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_nearbyint_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_nearbyint_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> 
@llvm.nearbyint.v4f32(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_pow_f64(<2 x double> %in, <2 x double> %pow) { +; CHECK-LABEL: @llvm_pow_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2vv_pow(<2 x double> [[IN:%.*]], <2 x double> [[POW:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %in, <2 x double> %pow) + ret <2 x double> %1 +} + +define <4 x float> @llvm_pow_f32(<4 x float> %in, <4 x float> %pow) { +; CHECK-LABEL: @llvm_pow_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4vv_powf(<4 x float> [[IN:%.*]], <4 x float> [[POW:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.pow.v4f32(<4 x float> %in, <4 x float> %pow) + ret <4 x float> %1 +} + +define <2 x double> @llvm_rint_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_rint_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.rint.v2f64(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.rint.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_rint_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_rint_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.rint.v4f32(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_round_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_round_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.round.v2f64(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.round.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_round_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_round_f32( +; CHECK-NEXT: [[TMP1:%.*]] = 
call fast <4 x float> @llvm.round.v4f32(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_sin_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_sin_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_sin(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.sin.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_sin_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_sin_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_sinf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.sin.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_sqrt_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_sqrt_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_sqrt_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_sqrt_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_tan_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_tan_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_tan(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.tan.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_tan_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_tan_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_tanf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x 
float> @llvm.tan.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_acos_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_acos_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_acos(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.acos.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_acos_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_acos_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_acosf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.acos.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_asin_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_asin_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_asin(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.asin.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_asin_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_asin_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_asinf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.asin.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_atan_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_atan_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_atan(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.atan.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_atan_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_atan_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_atanf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.atan.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_atan2_f64(<2 x double> 
%x, <2 x double> %y) { +; CHECK-LABEL: @llvm_atan2_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[INX:%.*]], <2 x double> [[INY:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.atan2.v2f64(<2 x double> %x, <2 x double> %y) + ret <2 x double> %1 +} + +define <4 x float> @llvm_atan2_f32(<4 x float> %x, <4 x float> %y) { +; CHECK-LABEL: @llvm_atan2_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[INX:%.*]], <4 x float> [[INY:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.atan2.v4f32(<4 x float> %x, <4 x float> %y) + ret <4 x float> %1 +} + +define <2 x double> @llvm_cosh_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_cosh_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_cosh(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.cosh.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_cosh_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_cosh_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_coshf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.cosh.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_sinh_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_sinh_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_sinh(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.sinh.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_sinh_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_sinh_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.sinh.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_tanh_f64(<2 x 
double> %in) { +; CHECK-LABEL: @llvm_tanh_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_tanh(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.tanh.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_tanh_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_tanh_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.tanh.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +define <2 x double> @llvm_trunc_f64(<2 x double> %in) { +; CHECK-LABEL: @llvm_trunc_f64( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x double> @llvm.trunc.v2f64(<2 x double> [[IN:%.*]]) +; CHECK-NEXT: ret <2 x double> [[TMP1]] +; + %1 = call fast <2 x double> @llvm.trunc.v2f64(<2 x double> %in) + ret <2 x double> %1 +} + +define <4 x float> @llvm_trunc_f32(<4 x float> %in) { +; CHECK-LABEL: @llvm_trunc_f32( +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> [[IN:%.*]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; + %1 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %in) + ret <4 x float> %1 +} + +declare <2 x double> @llvm.ceil.v2f64(<2 x double>) +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) +declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.cos.v2f64(<2 x double>) +declare <4 x float> @llvm.cos.v4f32(<4 x float>) +declare <2 x double> @llvm.exp.v2f64(<2 x double>) +declare <4 x float> @llvm.exp.v4f32(<4 x float>) +declare <2 x double> @llvm.exp2.v2f64(<2 x double>) +declare <4 x float> @llvm.exp2.v4f32(<4 x float>) +declare <2 x double> @llvm.exp10.v2f64(<2 x double>) +declare <4 x float> @llvm.exp10.v4f32(<4 x float>) +declare <2 x double> @llvm.fabs.v2f64(<2 x double>) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare <2 x 
double> @llvm.floor.v2f64(<2 x double>) +declare <4 x float> @llvm.floor.v4f32(<4 x float>) +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare <2 x double> @llvm.log.v2f64(<2 x double>) +declare <4 x float> @llvm.log.v4f32(<4 x float>) +declare <2 x double> @llvm.log10.v2f64(<2 x double>) +declare <4 x float> @llvm.log10.v4f32(<4 x float>) +declare <2 x double> @llvm.log2.v2f64(<2 x double>) +declare <4 x float> @llvm.log2.v4f32(<4 x float>) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) +declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) +declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.rint.v2f64(<2 x double>) +declare <4 x float> @llvm.rint.v4f32(<4 x float>) +declare <2 x double> @llvm.round.v2f64(<2 x double>) +declare <4 x float> @llvm.round.v4f32(<4 x float>) +declare <2 x double> @llvm.sin.v2f64(<2 x double>) +declare <4 x float> @llvm.sin.v4f32(<4 x float>) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) +declare <2 x double> @llvm.tan.v2f64(<2 x double>) +declare <4 x float> @llvm.tan.v4f32(<4 x float>) +declare <2 x double> @llvm.trunc.v2f64(<2 x double>) +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. 
diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll index 180c64e0a7de1..bd6c72a3946c1 100644 --- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -use-constant-int-for-scalable-splat < %s | FileCheck %s ; Check that expensive divides are expanded into a more performant sequence diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll index af813ff16a202..33d5ac4cd299e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,NEON -; RUN: llc -mattr=+sve,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,SVE -; RUN: llc -mattr=+sme,+i8mm -aarch64-enable-partial-reduce-nodes=true -force-streaming < %s | FileCheck %s --check-prefix=SME +; RUN: llc -mattr=+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=COMMON,NEON +; RUN: llc -mattr=+sve,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=COMMON,SVE +; RUN: llc -mattr=+sme,+i8mm -force-streaming < %s | FileCheck %s --check-prefix=SME target triple = "aarch64" diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll index 8b4386e2c2216..45781fa47c6de 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll @@ -3,6 +3,7 @@ ; RUN: llc 
-aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=128 -use-constant-int-for-fixed-length-splat < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128 target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index 221a15e5c8fe6..b2cde51e99619 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -1,20 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM -; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM -; RUN: llc -mtriple=aarch64 -mattr=+sve,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SVE -; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SVE2 -; RUN: llc -mtriple=aarch64 -mattr=+sve,+sme,+i8mm -force-streaming -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SME +; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE2 +; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefix=CHECK-SVE2-I8MM +; RUN: llc -mtriple=aarch64 -mattr=+sve2,+sme,+i8mm -force-streaming %s -o - | FileCheck %s --check-prefix=CHECK-SME define @udot( %acc, %a, %b) { -; CHECK-LABEL: udot: -; CHECK: // 
%bb.0: // %entry -; CHECK-NEXT: udot z0.s, z1.b, z2.b -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: udot: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: udot: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: udot z0.s, z1.b, z2.b -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: udot: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b +; CHECK-SVE2-I8MM-NEXT: ret +; +; CHECK-SME-LABEL: udot: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: udot z0.s, z1.b, z2.b +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -24,15 +27,20 @@ entry: } define @udot_wide( %acc, %a, %b) { -; CHECK-LABEL: udot_wide: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: udot z0.d, z1.h, z2.h -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: udot_wide: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: udot z0.d, z1.h, z2.h +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: udot_wide: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: udot z0.d, z1.h, z2.h +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: udot_wide: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: udot_wide: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: udot z0.d, z1.h, z2.h +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -42,15 +50,20 @@ entry: } define @sdot( %accc, %a, %b) { -; CHECK-LABEL: sdot: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sdot z0.s, z1.b, z2.b -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: sdot: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: sdot: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: sdot z0.s, z1.b, z2.b -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: sdot: +; CHECK-SVE2-I8MM: // 
%bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-SVE2-I8MM-NEXT: ret +; +; CHECK-SME-LABEL: sdot: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %b.wide = sext %b to @@ -60,15 +73,20 @@ entry: } define @sdot_wide( %acc, %a, %b) { -; CHECK-LABEL: sdot_wide: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sdot z0.d, z1.h, z2.h -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: sdot_wide: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: sdot z0.d, z1.h, z2.h +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: sdot_wide: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: sdot z0.d, z1.h, z2.h +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: sdot_wide: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: sdot z0.d, z1.h, z2.h -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: sdot_wide: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: sdot z0.d, z1.h, z2.h +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %b.wide = sext %b to @@ -78,36 +96,36 @@ entry: } define @usdot( %acc, %a, %b) { -; CHECK-I8MM-LABEL: usdot: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: usdot z0.s, z1.b, z2.b -; CHECK-I8MM-NEXT: ret +; CHECK-SVE2-LABEL: usdot: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: uunpklo z3.h, z1.b +; CHECK-SVE2-NEXT: sunpklo z4.h, z2.b +; CHECK-SVE2-NEXT: ptrue p0.s +; CHECK-SVE2-NEXT: uunpkhi z1.h, z1.b +; CHECK-SVE2-NEXT: sunpkhi z2.h, z2.b +; CHECK-SVE2-NEXT: uunpklo z5.s, z3.h +; CHECK-SVE2-NEXT: sunpklo z6.s, z4.h +; CHECK-SVE2-NEXT: uunpkhi z3.s, z3.h +; CHECK-SVE2-NEXT: sunpkhi z4.s, z4.h +; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-SVE2-NEXT: uunpklo z5.s, z1.h +; CHECK-SVE2-NEXT: sunpklo z6.s, z2.h +; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h +; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: mla z0.s, p0/m, z3.s, z4.s +; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-SVE2-NEXT: mla z0.s, 
p0/m, z1.s, z2.s +; CHECK-SVE2-NEXT: ret ; -; CHECK-NOI8MM-LABEL: usdot: -; CHECK-NOI8MM: // %bb.0: // %entry -; CHECK-NOI8MM-NEXT: uunpklo z3.h, z1.b -; CHECK-NOI8MM-NEXT: sunpklo z4.h, z2.b -; CHECK-NOI8MM-NEXT: ptrue p0.s -; CHECK-NOI8MM-NEXT: uunpkhi z1.h, z1.b -; CHECK-NOI8MM-NEXT: sunpkhi z2.h, z2.b -; CHECK-NOI8MM-NEXT: uunpklo z5.s, z3.h -; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h -; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h -; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h -; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s -; CHECK-NOI8MM-NEXT: uunpklo z5.s, z1.h -; CHECK-NOI8MM-NEXT: sunpklo z6.s, z2.h -; CHECK-NOI8MM-NEXT: uunpkhi z1.s, z1.h -; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h -; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z3.s, z4.s -; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s -; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z1.s, z2.s -; CHECK-NOI8MM-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: usdot: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: usdot z0.s, z1.b, z2.b +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: usdot: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: usdot z0.s, z1.b, z2.b -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: usdot: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: usdot z0.s, z1.b, z2.b +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = sext %b to @@ -117,36 +135,36 @@ entry: } define @sudot( %acc, %a, %b) { -; CHECK-I8MM-LABEL: sudot: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: usdot z0.s, z2.b, z1.b -; CHECK-I8MM-NEXT: ret +; CHECK-SVE2-LABEL: sudot: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: sunpklo z3.h, z1.b +; CHECK-SVE2-NEXT: uunpklo z4.h, z2.b +; CHECK-SVE2-NEXT: ptrue p0.s +; CHECK-SVE2-NEXT: sunpkhi z1.h, z1.b +; CHECK-SVE2-NEXT: uunpkhi z2.h, z2.b +; CHECK-SVE2-NEXT: sunpklo z5.s, z3.h +; CHECK-SVE2-NEXT: uunpklo z6.s, z4.h +; CHECK-SVE2-NEXT: sunpkhi z3.s, z3.h +; CHECK-SVE2-NEXT: uunpkhi z4.s, z4.h +; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s +; 
CHECK-SVE2-NEXT: sunpklo z5.s, z1.h +; CHECK-SVE2-NEXT: uunpklo z6.s, z2.h +; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h +; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: mla z0.s, p0/m, z3.s, z4.s +; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-SVE2-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-SVE2-NEXT: ret ; -; CHECK-NOI8MM-LABEL: sudot: -; CHECK-NOI8MM: // %bb.0: // %entry -; CHECK-NOI8MM-NEXT: sunpklo z3.h, z1.b -; CHECK-NOI8MM-NEXT: uunpklo z4.h, z2.b -; CHECK-NOI8MM-NEXT: ptrue p0.s -; CHECK-NOI8MM-NEXT: sunpkhi z1.h, z1.b -; CHECK-NOI8MM-NEXT: uunpkhi z2.h, z2.b -; CHECK-NOI8MM-NEXT: sunpklo z5.s, z3.h -; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h -; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h -; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h -; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s -; CHECK-NOI8MM-NEXT: sunpklo z5.s, z1.h -; CHECK-NOI8MM-NEXT: uunpklo z6.s, z2.h -; CHECK-NOI8MM-NEXT: sunpkhi z1.s, z1.h -; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h -; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z3.s, z4.s -; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s -; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z1.s, z2.s -; CHECK-NOI8MM-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: sudot: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: usdot z0.s, z2.b, z1.b +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: sudot: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: usdot z0.s, z2.b, z1.b -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: sudot: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: usdot z0.s, z2.b, z1.b +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %b.wide = zext %b to @@ -156,41 +174,29 @@ entry: } define @udot_8to64( %acc, %a, %b) { -; CHECK-LABEL: udot_8to64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEXT: udot z4.s, z2.b, z3.b -; CHECK-NEXT: sunpklo z2.d, z4.s -; CHECK-NEXT: sunpkhi z3.d, z4.s -; CHECK-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEXT: add z1.d, z1.d, z3.d -; CHECK-NEXT: ret +; 
CHECK-SVE2-LABEL: udot_8to64: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SVE2-NEXT: udot z4.s, z2.b, z3.b +; CHECK-SVE2-NEXT: uaddwb z0.d, z0.d, z4.s +; CHECK-SVE2-NEXT: uaddwt z0.d, z0.d, z4.s +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-SVE-LABEL: udot_8to64: -; CHECK-NEWLOWERING-SVE: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE-NEXT: udot z4.s, z2.b, z3.b -; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z2.d, z4.s -; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z3.d, z4.s -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z3.d -; CHECK-NEWLOWERING-SVE-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: udot_8to64: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SVE2-I8MM-NEXT: udot z4.s, z2.b, z3.b +; CHECK-SVE2-I8MM-NEXT: uaddwb z0.d, z0.d, z4.s +; CHECK-SVE2-I8MM-NEXT: uaddwt z0.d, z0.d, z4.s +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-SVE2-LABEL: udot_8to64: -; CHECK-NEWLOWERING-SVE2: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE2-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE2-NEXT: udot z4.s, z2.b, z3.b -; CHECK-NEWLOWERING-SVE2-NEXT: uaddwb z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SVE2-NEXT: uaddwt z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SVE2-NEXT: ret -; -; CHECK-NEWLOWERING-SME-LABEL: udot_8to64: -; CHECK-NEWLOWERING-SME: // %bb.0: // %entry -; CHECK-NEWLOWERING-SME-NEXT: mov z4.s, #0 // =0x0 -; CHECK-NEWLOWERING-SME-NEXT: udot z4.s, z2.b, z3.b -; CHECK-NEWLOWERING-SME-NEXT: uaddwb z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: uaddwt z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: ret +; CHECK-SME-LABEL: udot_8to64: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 +; CHECK-SME-NEXT: udot z4.s, z2.b, z3.b +; CHECK-SME-NEXT: uaddwb z0.d, z0.d, z4.s +; CHECK-SME-NEXT: uaddwt z0.d, z0.d, z4.s +; CHECK-SME-NEXT: ret entry: %a.wide = zext 
%a to %b.wide = zext %b to @@ -201,41 +207,29 @@ entry: } define @sdot_8to64( %acc, %a, %b){ -; CHECK-LABEL: sdot_8to64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEXT: sdot z4.s, z2.b, z3.b -; CHECK-NEXT: sunpklo z2.d, z4.s -; CHECK-NEXT: sunpkhi z3.d, z4.s -; CHECK-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEXT: add z1.d, z1.d, z3.d -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-SVE-LABEL: sdot_8to64: -; CHECK-NEWLOWERING-SVE: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE-NEXT: sdot z4.s, z2.b, z3.b -; CHECK-NEWLOWERING-SVE-NEXT: sunpklo z2.d, z4.s -; CHECK-NEWLOWERING-SVE-NEXT: sunpkhi z3.d, z4.s -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z3.d -; CHECK-NEWLOWERING-SVE-NEXT: ret +; CHECK-SVE2-LABEL: sdot_8to64: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SVE2-NEXT: sdot z4.s, z2.b, z3.b +; CHECK-SVE2-NEXT: saddwb z0.d, z0.d, z4.s +; CHECK-SVE2-NEXT: saddwt z0.d, z0.d, z4.s +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-SVE2-LABEL: sdot_8to64: -; CHECK-NEWLOWERING-SVE2: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE2-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE2-NEXT: sdot z4.s, z2.b, z3.b -; CHECK-NEWLOWERING-SVE2-NEXT: saddwb z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SVE2-NEXT: saddwt z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SVE2-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: sdot_8to64: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SVE2-I8MM-NEXT: sdot z4.s, z2.b, z3.b +; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z4.s +; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z4.s +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-SME-LABEL: sdot_8to64: -; CHECK-NEWLOWERING-SME: // %bb.0: // %entry -; CHECK-NEWLOWERING-SME-NEXT: mov z4.s, #0 // =0x0 -; CHECK-NEWLOWERING-SME-NEXT: sdot z4.s, z2.b, z3.b -; 
CHECK-NEWLOWERING-SME-NEXT: saddwb z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: saddwt z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: ret +; CHECK-SME-LABEL: sdot_8to64: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 +; CHECK-SME-NEXT: sdot z4.s, z2.b, z3.b +; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s +; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %b.wide = sext %b to @@ -246,82 +240,62 @@ entry: } define @usdot_8to64( %acc, %a, %b){ -; CHECK-I8MM-LABEL: usdot_8to64: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-I8MM-NEXT: usdot z4.s, z2.b, z3.b -; CHECK-I8MM-NEXT: sunpklo z2.d, z4.s -; CHECK-I8MM-NEXT: sunpkhi z3.d, z4.s -; CHECK-I8MM-NEXT: add z0.d, z0.d, z2.d -; CHECK-I8MM-NEXT: add z1.d, z1.d, z3.d -; CHECK-I8MM-NEXT: ret -; -; CHECK-NOI8MM-LABEL: usdot_8to64: -; CHECK-NOI8MM: // %bb.0: // %entry -; CHECK-NOI8MM-NEXT: uunpkhi z4.h, z2.b -; CHECK-NOI8MM-NEXT: uunpklo z2.h, z2.b -; CHECK-NOI8MM-NEXT: sunpkhi z5.h, z3.b -; CHECK-NOI8MM-NEXT: sunpklo z3.h, z3.b -; CHECK-NOI8MM-NEXT: ptrue p0.d -; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h -; CHECK-NOI8MM-NEXT: uunpklo z7.s, z2.h -; CHECK-NOI8MM-NEXT: sunpklo z24.s, z5.h -; CHECK-NOI8MM-NEXT: sunpklo z25.s, z3.h -; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h -; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h -; CHECK-NOI8MM-NEXT: sunpkhi z5.s, z5.h -; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h -; CHECK-NOI8MM-NEXT: uunpklo z26.d, z6.s -; CHECK-NOI8MM-NEXT: uunpklo z27.d, z7.s -; CHECK-NOI8MM-NEXT: sunpklo z28.d, z24.s -; CHECK-NOI8MM-NEXT: sunpklo z29.d, z25.s -; CHECK-NOI8MM-NEXT: uunpkhi z6.d, z6.s -; CHECK-NOI8MM-NEXT: uunpkhi z7.d, z7.s -; CHECK-NOI8MM-NEXT: sunpkhi z24.d, z24.s -; CHECK-NOI8MM-NEXT: sunpkhi z25.d, z25.s -; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d -; CHECK-NOI8MM-NEXT: uunpklo z26.d, z4.s -; CHECK-NOI8MM-NEXT: sunpklo z28.d, z5.s -; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d -; 
CHECK-NOI8MM-NEXT: uunpklo z27.d, z2.s -; CHECK-NOI8MM-NEXT: sunpklo z29.d, z3.s -; CHECK-NOI8MM-NEXT: uunpkhi z4.d, z4.s -; CHECK-NOI8MM-NEXT: uunpkhi z2.d, z2.s -; CHECK-NOI8MM-NEXT: sunpkhi z5.d, z5.s -; CHECK-NOI8MM-NEXT: sunpkhi z3.d, z3.s -; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z6.d, z24.d -; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z7.d, z25.d -; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d -; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d -; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z4.d, z5.d -; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z3.d -; CHECK-NOI8MM-NEXT: ret -; -; CHECK-NEWLOWERING-SVE-LABEL: usdot_8to64: -; CHECK-NEWLOWERING-SVE: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE-NEXT: usdot z4.s, z2.b, z3.b -; CHECK-NEWLOWERING-SVE-NEXT: sunpklo z2.d, z4.s -; CHECK-NEWLOWERING-SVE-NEXT: sunpkhi z3.d, z4.s -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z3.d -; CHECK-NEWLOWERING-SVE-NEXT: ret +; CHECK-SVE2-LABEL: usdot_8to64: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: uunpkhi z4.h, z2.b +; CHECK-SVE2-NEXT: uunpklo z2.h, z2.b +; CHECK-SVE2-NEXT: sunpkhi z5.h, z3.b +; CHECK-SVE2-NEXT: sunpklo z3.h, z3.b +; CHECK-SVE2-NEXT: ptrue p0.d +; CHECK-SVE2-NEXT: uunpklo z6.s, z4.h +; CHECK-SVE2-NEXT: uunpklo z7.s, z2.h +; CHECK-SVE2-NEXT: sunpklo z24.s, z5.h +; CHECK-SVE2-NEXT: sunpklo z25.s, z3.h +; CHECK-SVE2-NEXT: uunpkhi z4.s, z4.h +; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: sunpkhi z5.s, z5.h +; CHECK-SVE2-NEXT: sunpkhi z3.s, z3.h +; CHECK-SVE2-NEXT: uunpklo z26.d, z6.s +; CHECK-SVE2-NEXT: uunpklo z27.d, z7.s +; CHECK-SVE2-NEXT: sunpklo z28.d, z24.s +; CHECK-SVE2-NEXT: sunpklo z29.d, z25.s +; CHECK-SVE2-NEXT: uunpkhi z6.d, z6.s +; CHECK-SVE2-NEXT: uunpkhi z7.d, z7.s +; CHECK-SVE2-NEXT: sunpkhi z24.d, z24.s +; CHECK-SVE2-NEXT: sunpkhi z25.d, z25.s +; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d +; CHECK-SVE2-NEXT: uunpklo z26.d, z4.s 
+; CHECK-SVE2-NEXT: sunpklo z28.d, z5.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d +; CHECK-SVE2-NEXT: uunpklo z27.d, z2.s +; CHECK-SVE2-NEXT: sunpklo z29.d, z3.s +; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s +; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s +; CHECK-SVE2-NEXT: sunpkhi z5.d, z5.s +; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s +; CHECK-SVE2-NEXT: mla z1.d, p0/m, z6.d, z24.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z7.d, z25.d +; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d +; CHECK-SVE2-NEXT: mla z1.d, p0/m, z4.d, z5.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z2.d, z3.d +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-SVE2-LABEL: usdot_8to64: -; CHECK-NEWLOWERING-SVE2: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE2-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE2-NEXT: usdot z4.s, z2.b, z3.b -; CHECK-NEWLOWERING-SVE2-NEXT: saddwb z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SVE2-NEXT: saddwt z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SVE2-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: usdot_8to64: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SVE2-I8MM-NEXT: usdot z4.s, z2.b, z3.b +; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z4.s +; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z4.s +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-SME-LABEL: usdot_8to64: -; CHECK-NEWLOWERING-SME: // %bb.0: // %entry -; CHECK-NEWLOWERING-SME-NEXT: mov z4.s, #0 // =0x0 -; CHECK-NEWLOWERING-SME-NEXT: usdot z4.s, z2.b, z3.b -; CHECK-NEWLOWERING-SME-NEXT: saddwb z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: saddwt z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: ret +; CHECK-SME-LABEL: usdot_8to64: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 +; CHECK-SME-NEXT: usdot z4.s, z2.b, z3.b +; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s +; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = sext %b to @@ -332,82 +306,62 @@ entry: } define 
@sudot_8to64( %acc, %a, %b) { -; CHECK-I8MM-LABEL: sudot_8to64: -; CHECK-I8MM: // %bb.0: // %entry -; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000 -; CHECK-I8MM-NEXT: usdot z4.s, z3.b, z2.b -; CHECK-I8MM-NEXT: sunpklo z2.d, z4.s -; CHECK-I8MM-NEXT: sunpkhi z3.d, z4.s -; CHECK-I8MM-NEXT: add z0.d, z0.d, z2.d -; CHECK-I8MM-NEXT: add z1.d, z1.d, z3.d -; CHECK-I8MM-NEXT: ret +; CHECK-SVE2-LABEL: sudot_8to64: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: sunpkhi z4.h, z2.b +; CHECK-SVE2-NEXT: sunpklo z2.h, z2.b +; CHECK-SVE2-NEXT: uunpkhi z5.h, z3.b +; CHECK-SVE2-NEXT: uunpklo z3.h, z3.b +; CHECK-SVE2-NEXT: ptrue p0.d +; CHECK-SVE2-NEXT: sunpklo z6.s, z4.h +; CHECK-SVE2-NEXT: sunpklo z7.s, z2.h +; CHECK-SVE2-NEXT: uunpklo z24.s, z5.h +; CHECK-SVE2-NEXT: uunpklo z25.s, z3.h +; CHECK-SVE2-NEXT: sunpkhi z4.s, z4.h +; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: uunpkhi z5.s, z5.h +; CHECK-SVE2-NEXT: uunpkhi z3.s, z3.h +; CHECK-SVE2-NEXT: sunpklo z26.d, z6.s +; CHECK-SVE2-NEXT: sunpklo z27.d, z7.s +; CHECK-SVE2-NEXT: uunpklo z28.d, z24.s +; CHECK-SVE2-NEXT: uunpklo z29.d, z25.s +; CHECK-SVE2-NEXT: sunpkhi z6.d, z6.s +; CHECK-SVE2-NEXT: sunpkhi z7.d, z7.s +; CHECK-SVE2-NEXT: uunpkhi z24.d, z24.s +; CHECK-SVE2-NEXT: uunpkhi z25.d, z25.s +; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d +; CHECK-SVE2-NEXT: sunpklo z26.d, z4.s +; CHECK-SVE2-NEXT: uunpklo z28.d, z5.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d +; CHECK-SVE2-NEXT: sunpklo z27.d, z2.s +; CHECK-SVE2-NEXT: uunpklo z29.d, z3.s +; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s +; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s +; CHECK-SVE2-NEXT: uunpkhi z5.d, z5.s +; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s +; CHECK-SVE2-NEXT: mla z1.d, p0/m, z6.d, z24.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z7.d, z25.d +; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d +; CHECK-SVE2-NEXT: mla z1.d, p0/m, z4.d, z5.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z2.d, z3.d +; CHECK-SVE2-NEXT: ret ; -; 
CHECK-NOI8MM-LABEL: sudot_8to64: -; CHECK-NOI8MM: // %bb.0: // %entry -; CHECK-NOI8MM-NEXT: sunpkhi z4.h, z2.b -; CHECK-NOI8MM-NEXT: sunpklo z2.h, z2.b -; CHECK-NOI8MM-NEXT: uunpkhi z5.h, z3.b -; CHECK-NOI8MM-NEXT: uunpklo z3.h, z3.b -; CHECK-NOI8MM-NEXT: ptrue p0.d -; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h -; CHECK-NOI8MM-NEXT: sunpklo z7.s, z2.h -; CHECK-NOI8MM-NEXT: uunpklo z24.s, z5.h -; CHECK-NOI8MM-NEXT: uunpklo z25.s, z3.h -; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h -; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h -; CHECK-NOI8MM-NEXT: uunpkhi z5.s, z5.h -; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h -; CHECK-NOI8MM-NEXT: sunpklo z26.d, z6.s -; CHECK-NOI8MM-NEXT: sunpklo z27.d, z7.s -; CHECK-NOI8MM-NEXT: uunpklo z28.d, z24.s -; CHECK-NOI8MM-NEXT: uunpklo z29.d, z25.s -; CHECK-NOI8MM-NEXT: sunpkhi z6.d, z6.s -; CHECK-NOI8MM-NEXT: sunpkhi z7.d, z7.s -; CHECK-NOI8MM-NEXT: uunpkhi z24.d, z24.s -; CHECK-NOI8MM-NEXT: uunpkhi z25.d, z25.s -; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d -; CHECK-NOI8MM-NEXT: sunpklo z26.d, z4.s -; CHECK-NOI8MM-NEXT: uunpklo z28.d, z5.s -; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d -; CHECK-NOI8MM-NEXT: sunpklo z27.d, z2.s -; CHECK-NOI8MM-NEXT: uunpklo z29.d, z3.s -; CHECK-NOI8MM-NEXT: sunpkhi z4.d, z4.s -; CHECK-NOI8MM-NEXT: sunpkhi z2.d, z2.s -; CHECK-NOI8MM-NEXT: uunpkhi z5.d, z5.s -; CHECK-NOI8MM-NEXT: uunpkhi z3.d, z3.s -; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z6.d, z24.d -; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z7.d, z25.d -; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d -; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d -; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z4.d, z5.d -; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z3.d -; CHECK-NOI8MM-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: sudot_8to64: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000 +; CHECK-SVE2-I8MM-NEXT: usdot z4.s, z3.b, z2.b +; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z4.s +; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z4.s +; CHECK-SVE2-I8MM-NEXT: 
ret ; -; CHECK-NEWLOWERING-SVE-LABEL: sudot_8to64: -; CHECK-NEWLOWERING-SVE: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE-NEXT: usdot z4.s, z3.b, z2.b -; CHECK-NEWLOWERING-SVE-NEXT: sunpklo z2.d, z4.s -; CHECK-NEWLOWERING-SVE-NEXT: sunpkhi z3.d, z4.s -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z3.d -; CHECK-NEWLOWERING-SVE-NEXT: ret -; -; CHECK-NEWLOWERING-SVE2-LABEL: sudot_8to64: -; CHECK-NEWLOWERING-SVE2: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE2-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE2-NEXT: usdot z4.s, z3.b, z2.b -; CHECK-NEWLOWERING-SVE2-NEXT: saddwb z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SVE2-NEXT: saddwt z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SVE2-NEXT: ret -; -; CHECK-NEWLOWERING-SME-LABEL: sudot_8to64: -; CHECK-NEWLOWERING-SME: // %bb.0: // %entry -; CHECK-NEWLOWERING-SME-NEXT: mov z4.s, #0 // =0x0 -; CHECK-NEWLOWERING-SME-NEXT: usdot z4.s, z3.b, z2.b -; CHECK-NEWLOWERING-SME-NEXT: saddwb z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: saddwt z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: ret +; CHECK-SME-LABEL: sudot_8to64: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 +; CHECK-SME-NEXT: usdot z4.s, z3.b, z2.b +; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s +; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %b.wide = zext %b to @@ -418,51 +372,69 @@ entry: } define @udot_no_bin_op( %acc, %a){ -; CHECK-LABEL: udot_no_bin_op: -; CHECK: // %bb.0: -; CHECK-NEXT: mov z2.b, #1 // =0x1 -; CHECK-NEXT: udot z0.s, z1.b, z2.b -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: udot_no_bin_op: +; CHECK-SVE2: // %bb.0: +; CHECK-SVE2-NEXT: mov z2.b, #1 // =0x1 +; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: udot_no_bin_op: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: mov z2.b, #1 // =0x1 -; CHECK-NEWLOWERING-NEXT: 
udot z0.s, z1.b, z2.b -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op: +; CHECK-SVE2-I8MM: // %bb.0: +; CHECK-SVE2-I8MM-NEXT: mov z2.b, #1 // =0x1 +; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b +; CHECK-SVE2-I8MM-NEXT: ret +; +; CHECK-SME-LABEL: udot_no_bin_op: +; CHECK-SME: // %bb.0: +; CHECK-SME-NEXT: mov z2.b, #1 // =0x1 +; CHECK-SME-NEXT: udot z0.s, z1.b, z2.b +; CHECK-SME-NEXT: ret %a.ext = zext %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) ret %partial.reduce } define @sdot_no_bin_op( %acc, %a){ -; CHECK-LABEL: sdot_no_bin_op: -; CHECK: // %bb.0: -; CHECK-NEXT: mov z2.b, #1 // =0x1 -; CHECK-NEXT: sdot z0.s, z1.b, z2.b -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: sdot_no_bin_op: +; CHECK-SVE2: // %bb.0: +; CHECK-SVE2-NEXT: mov z2.b, #1 // =0x1 +; CHECK-SVE2-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op: +; CHECK-SVE2-I8MM: // %bb.0: +; CHECK-SVE2-I8MM-NEXT: mov z2.b, #1 // =0x1 +; CHECK-SVE2-I8MM-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op: -; CHECK-NEWLOWERING: // %bb.0: -; CHECK-NEWLOWERING-NEXT: mov z2.b, #1 // =0x1 -; CHECK-NEWLOWERING-NEXT: sdot z0.s, z1.b, z2.b -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: sdot_no_bin_op: +; CHECK-SME: // %bb.0: +; CHECK-SME-NEXT: mov z2.b, #1 // =0x1 +; CHECK-SME-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-SME-NEXT: ret %a.ext = sext %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) ret %partial.reduce } define @udot_no_bin_op_wide( %acc, %a, %b){ -; CHECK-LABEL: udot_no_bin_op_wide: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.h, #1 // =0x1 -; CHECK-NEXT: udot z0.d, z1.h, z2.h -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: udot_no_bin_op_wide: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: mov z2.h, #1 // =0x1 +; CHECK-SVE2-NEXT: udot z0.d, z1.h, z2.h +; 
CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_wide: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: mov z2.h, #1 // =0x1 -; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_wide: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: mov z2.h, #1 // =0x1 +; CHECK-SVE2-I8MM-NEXT: udot z0.d, z1.h, z2.h +; CHECK-SVE2-I8MM-NEXT: ret +; +; CHECK-SME-LABEL: udot_no_bin_op_wide: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: mov z2.h, #1 // =0x1 +; CHECK-SME-NEXT: udot z0.d, z1.h, z2.h +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %a.wide) @@ -470,17 +442,23 @@ entry: } define @sdot_no_bin_op_wide( %acc, %a, %b){ -; CHECK-LABEL: sdot_no_bin_op_wide: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.h, #1 // =0x1 -; CHECK-NEXT: sdot z0.d, z1.h, z2.h -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: sdot_no_bin_op_wide: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: mov z2.h, #1 // =0x1 +; CHECK-SVE2-NEXT: sdot z0.d, z1.h, z2.h +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_wide: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: mov z2.h, #1 // =0x1 +; CHECK-SVE2-I8MM-NEXT: sdot z0.d, z1.h, z2.h +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_wide: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: mov z2.h, #1 // =0x1 -; CHECK-NEWLOWERING-NEXT: sdot z0.d, z1.h, z2.h -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: sdot_no_bin_op_wide: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: mov z2.h, #1 // =0x1 +; CHECK-SME-NEXT: sdot z0.d, z1.h, z2.h +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %a.wide) @@ -488,137 +466,93 @@ entry: } define @udot_no_bin_op_8to64( %acc, 
%a){ -; CHECK-LABEL: udot_no_bin_op_8to64: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: mov z4.b, #1 // =0x1 -; CHECK-NEXT: udot z3.s, z2.b, z4.b -; CHECK-NEXT: sunpklo z2.d, z3.s -; CHECK-NEXT: sunpkhi z3.d, z3.s -; CHECK-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEXT: add z1.d, z1.d, z3.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: udot_no_bin_op_8to64: +; CHECK-SVE2: // %bb.0: +; CHECK-SVE2-NEXT: movi v3.2d, #0000000000000000 +; CHECK-SVE2-NEXT: mov z4.b, #1 // =0x1 +; CHECK-SVE2-NEXT: udot z3.s, z2.b, z4.b +; CHECK-SVE2-NEXT: uaddwb z0.d, z0.d, z3.s +; CHECK-SVE2-NEXT: uaddwt z0.d, z0.d, z3.s +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-SVE-LABEL: udot_no_bin_op_8to64: -; CHECK-NEWLOWERING-SVE: // %bb.0: -; CHECK-NEWLOWERING-SVE-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE-NEXT: mov z4.b, #1 // =0x1 -; CHECK-NEWLOWERING-SVE-NEXT: udot z3.s, z2.b, z4.b -; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z2.d, z3.s -; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z3.d -; CHECK-NEWLOWERING-SVE-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_8to64: +; CHECK-SVE2-I8MM: // %bb.0: +; CHECK-SVE2-I8MM-NEXT: movi v3.2d, #0000000000000000 +; CHECK-SVE2-I8MM-NEXT: mov z4.b, #1 // =0x1 +; CHECK-SVE2-I8MM-NEXT: udot z3.s, z2.b, z4.b +; CHECK-SVE2-I8MM-NEXT: uaddwb z0.d, z0.d, z3.s +; CHECK-SVE2-I8MM-NEXT: uaddwt z0.d, z0.d, z3.s +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-SVE2-LABEL: udot_no_bin_op_8to64: -; CHECK-NEWLOWERING-SVE2: // %bb.0: -; CHECK-NEWLOWERING-SVE2-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE2-NEXT: mov z4.b, #1 // =0x1 -; CHECK-NEWLOWERING-SVE2-NEXT: udot z3.s, z2.b, z4.b -; CHECK-NEWLOWERING-SVE2-NEXT: uaddwb z0.d, z0.d, z3.s -; CHECK-NEWLOWERING-SVE2-NEXT: uaddwt z0.d, z0.d, z3.s -; CHECK-NEWLOWERING-SVE2-NEXT: ret -; -; CHECK-NEWLOWERING-SME-LABEL: udot_no_bin_op_8to64: -; CHECK-NEWLOWERING-SME: // 
%bb.0: -; CHECK-NEWLOWERING-SME-NEXT: mov z3.b, #1 // =0x1 -; CHECK-NEWLOWERING-SME-NEXT: mov z4.s, #0 // =0x0 -; CHECK-NEWLOWERING-SME-NEXT: udot z4.s, z2.b, z3.b -; CHECK-NEWLOWERING-SME-NEXT: uaddwb z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: uaddwt z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: ret +; CHECK-SME-LABEL: udot_no_bin_op_8to64: +; CHECK-SME: // %bb.0: +; CHECK-SME-NEXT: mov z3.b, #1 // =0x1 +; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 +; CHECK-SME-NEXT: udot z4.s, z2.b, z3.b +; CHECK-SME-NEXT: uaddwb z0.d, z0.d, z4.s +; CHECK-SME-NEXT: uaddwt z0.d, z0.d, z4.s +; CHECK-SME-NEXT: ret %a.ext = zext %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) ret %partial.reduce } define @sdot_no_bin_op_8to64( %acc, %a){ -; CHECK-LABEL: sdot_no_bin_op_8to64: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: mov z4.b, #1 // =0x1 -; CHECK-NEXT: sdot z3.s, z2.b, z4.b -; CHECK-NEXT: sunpklo z2.d, z3.s -; CHECK-NEXT: sunpkhi z3.d, z3.s -; CHECK-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEXT: add z1.d, z1.d, z3.d -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-SVE-LABEL: sdot_no_bin_op_8to64: -; CHECK-NEWLOWERING-SVE: // %bb.0: -; CHECK-NEWLOWERING-SVE-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE-NEXT: mov z4.b, #1 // =0x1 -; CHECK-NEWLOWERING-SVE-NEXT: sdot z3.s, z2.b, z4.b -; CHECK-NEWLOWERING-SVE-NEXT: sunpklo z2.d, z3.s -; CHECK-NEWLOWERING-SVE-NEXT: sunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z2.d -; CHECK-NEWLOWERING-SVE-NEXT: add z0.d, z0.d, z3.d -; CHECK-NEWLOWERING-SVE-NEXT: ret +; CHECK-SVE2-LABEL: sdot_no_bin_op_8to64: +; CHECK-SVE2: // %bb.0: +; CHECK-SVE2-NEXT: movi v3.2d, #0000000000000000 +; CHECK-SVE2-NEXT: mov z4.b, #1 // =0x1 +; CHECK-SVE2-NEXT: sdot z3.s, z2.b, z4.b +; CHECK-SVE2-NEXT: saddwb z0.d, z0.d, z3.s +; CHECK-SVE2-NEXT: saddwt z0.d, z0.d, z3.s +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-SVE2-LABEL: sdot_no_bin_op_8to64: -; 
CHECK-NEWLOWERING-SVE2: // %bb.0: -; CHECK-NEWLOWERING-SVE2-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEWLOWERING-SVE2-NEXT: mov z4.b, #1 // =0x1 -; CHECK-NEWLOWERING-SVE2-NEXT: sdot z3.s, z2.b, z4.b -; CHECK-NEWLOWERING-SVE2-NEXT: saddwb z0.d, z0.d, z3.s -; CHECK-NEWLOWERING-SVE2-NEXT: saddwt z0.d, z0.d, z3.s -; CHECK-NEWLOWERING-SVE2-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_8to64: +; CHECK-SVE2-I8MM: // %bb.0: +; CHECK-SVE2-I8MM-NEXT: movi v3.2d, #0000000000000000 +; CHECK-SVE2-I8MM-NEXT: mov z4.b, #1 // =0x1 +; CHECK-SVE2-I8MM-NEXT: sdot z3.s, z2.b, z4.b +; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z3.s +; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z3.s +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-SME-LABEL: sdot_no_bin_op_8to64: -; CHECK-NEWLOWERING-SME: // %bb.0: -; CHECK-NEWLOWERING-SME-NEXT: mov z3.b, #1 // =0x1 -; CHECK-NEWLOWERING-SME-NEXT: mov z4.s, #0 // =0x0 -; CHECK-NEWLOWERING-SME-NEXT: sdot z4.s, z2.b, z3.b -; CHECK-NEWLOWERING-SME-NEXT: saddwb z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: saddwt z0.d, z0.d, z4.s -; CHECK-NEWLOWERING-SME-NEXT: ret +; CHECK-SME-LABEL: sdot_no_bin_op_8to64: +; CHECK-SME: // %bb.0: +; CHECK-SME-NEXT: mov z3.b, #1 // =0x1 +; CHECK-SME-NEXT: mov z4.s, #0 // =0x0 +; CHECK-SME-NEXT: sdot z4.s, z2.b, z3.b +; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s +; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s +; CHECK-SME-NEXT: ret %a.ext = sext %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) ret %partial.reduce } define @not_udot( %acc, %a, %b) { -; CHECK-LABEL: not_udot: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEXT: mla z0.s, p0/m, z3.s, z4.s -; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: 
not_udot: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff +; CHECK-SVE2-NEXT: and z1.h, z1.h, #0xff +; CHECK-SVE2-NEXT: umlalb z0.s, z1.h, z2.h +; CHECK-SVE2-NEXT: umlalt z0.s, z1.h, z2.h +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-SVE-LABEL: not_udot: -; CHECK-NEWLOWERING-SVE: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEWLOWERING-SVE-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEWLOWERING-SVE-NEXT: ptrue p0.s -; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z3.s, z1.h -; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z4.s, z2.h -; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-SVE-NEXT: mla z0.s, p0/m, z3.s, z4.s -; CHECK-NEWLOWERING-SVE-NEXT: mla z0.s, p0/m, z1.s, z2.s -; CHECK-NEWLOWERING-SVE-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: not_udot: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff +; CHECK-SVE2-I8MM-NEXT: and z1.h, z1.h, #0xff +; CHECK-SVE2-I8MM-NEXT: umlalb z0.s, z1.h, z2.h +; CHECK-SVE2-I8MM-NEXT: umlalt z0.s, z1.h, z2.h +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-SVE2-LABEL: not_udot: -; CHECK-NEWLOWERING-SVE2: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE2-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEWLOWERING-SVE2-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEWLOWERING-SVE2-NEXT: umlalb z0.s, z1.h, z2.h -; CHECK-NEWLOWERING-SVE2-NEXT: umlalt z0.s, z1.h, z2.h -; CHECK-NEWLOWERING-SVE2-NEXT: ret -; -; CHECK-NEWLOWERING-SME-LABEL: not_udot: -; CHECK-NEWLOWERING-SME: // %bb.0: // %entry -; CHECK-NEWLOWERING-SME-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEWLOWERING-SME-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEWLOWERING-SME-NEXT: umlalb z0.s, z1.h, z2.h -; CHECK-NEWLOWERING-SME-NEXT: umlalt z0.s, z1.h, z2.h -; CHECK-NEWLOWERING-SME-NEXT: ret +; CHECK-SME-LABEL: not_udot: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: and z2.h, z2.h, #0xff +; CHECK-SME-NEXT: and z1.h, z1.h, #0xff +; CHECK-SME-NEXT: umlalb z0.s, 
z1.h, z2.h +; CHECK-SME-NEXT: umlalt z0.s, z1.h, z2.h +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -628,47 +562,29 @@ entry: } define @not_udot_wide( %acc, %a, %b) { -; CHECK-LABEL: not_udot_wide: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEXT: and z2.s, z2.s, #0xffff -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: uunpklo z4.d, z2.s -; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: ret -; -; CHECK-NEWLOWERING-SVE-LABEL: not_udot_wide: -; CHECK-NEWLOWERING-SVE: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEWLOWERING-SVE-NEXT: and z2.s, z2.s, #0xffff -; CHECK-NEWLOWERING-SVE-NEXT: ptrue p0.d -; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z3.d, z1.s -; CHECK-NEWLOWERING-SVE-NEXT: uunpklo z4.d, z2.s -; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEWLOWERING-SVE-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-SVE-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEWLOWERING-SVE-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-SVE-NEXT: ret +; CHECK-SVE2-LABEL: not_udot_wide: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: and z2.s, z2.s, #0xffff +; CHECK-SVE2-NEXT: and z1.s, z1.s, #0xffff +; CHECK-SVE2-NEXT: umlalb z0.d, z1.s, z2.s +; CHECK-SVE2-NEXT: umlalt z0.d, z1.s, z2.s +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-SVE2-LABEL: not_udot_wide: -; CHECK-NEWLOWERING-SVE2: // %bb.0: // %entry -; CHECK-NEWLOWERING-SVE2-NEXT: and z2.s, z2.s, #0xffff -; CHECK-NEWLOWERING-SVE2-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEWLOWERING-SVE2-NEXT: umlalb z0.d, z1.s, z2.s -; CHECK-NEWLOWERING-SVE2-NEXT: umlalt z0.d, z1.s, z2.s -; CHECK-NEWLOWERING-SVE2-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: not_udot_wide: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: and z2.s, z2.s, #0xffff +; CHECK-SVE2-I8MM-NEXT: and z1.s, z1.s, 
#0xffff +; CHECK-SVE2-I8MM-NEXT: umlalb z0.d, z1.s, z2.s +; CHECK-SVE2-I8MM-NEXT: umlalt z0.d, z1.s, z2.s +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-SME-LABEL: not_udot_wide: -; CHECK-NEWLOWERING-SME: // %bb.0: // %entry -; CHECK-NEWLOWERING-SME-NEXT: and z2.s, z2.s, #0xffff -; CHECK-NEWLOWERING-SME-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEWLOWERING-SME-NEXT: umlalb z0.d, z1.s, z2.s -; CHECK-NEWLOWERING-SME-NEXT: umlalt z0.d, z1.s, z2.s -; CHECK-NEWLOWERING-SME-NEXT: ret +; CHECK-SME-LABEL: not_udot_wide: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: and z2.s, z2.s, #0xffff +; CHECK-SME-NEXT: and z1.s, z1.s, #0xffff +; CHECK-SME-NEXT: umlalb z0.d, z1.s, z2.s +; CHECK-SME-NEXT: umlalt z0.d, z1.s, z2.s +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -678,47 +594,68 @@ entry: } define @not_usdot( %acc, %a, %b) { -; CHECK-LABEL: not_usdot: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: sunpklo z6.d, z4.s -; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: sunpkhi z4.d, z4.s -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: uunpklo z5.d, z1.s -; CHECK-NEXT: sunpklo z6.d, z2.s -; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: sunpkhi z2.d, z2.s -; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: not_usdot: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: uunpklo z3.s, z1.h +; CHECK-SVE2-NEXT: sunpklo z4.s, z2.h +; CHECK-SVE2-NEXT: ptrue p0.d +; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h +; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: uunpklo z5.d, z3.s +; CHECK-SVE2-NEXT: sunpklo z6.d, z4.s +; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s +; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d 
+; CHECK-SVE2-NEXT: uunpklo z5.d, z1.s +; CHECK-SVE2-NEXT: sunpklo z6.d, z2.s +; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s +; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: not_usdot: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h -; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h -; CHECK-NEWLOWERING-NEXT: ptrue p0.d -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s -; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z1.s -; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: not_usdot: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z1.h +; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z2.h +; CHECK-SVE2-I8MM-NEXT: ptrue p0.d +; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h +; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h +; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z3.s +; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z4.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z3.d, z3.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z4.d, z4.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z1.s +; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z2.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.d, z1.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.d, z2.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d +; 
CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-I8MM-NEXT: ret +; +; CHECK-SME-LABEL: not_usdot: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: uunpklo z3.s, z1.h +; CHECK-SME-NEXT: sunpklo z4.s, z2.h +; CHECK-SME-NEXT: ptrue p0.d +; CHECK-SME-NEXT: uunpkhi z1.s, z1.h +; CHECK-SME-NEXT: sunpkhi z2.s, z2.h +; CHECK-SME-NEXT: uunpklo z5.d, z3.s +; CHECK-SME-NEXT: sunpklo z6.d, z4.s +; CHECK-SME-NEXT: uunpkhi z3.d, z3.s +; CHECK-SME-NEXT: sunpkhi z4.d, z4.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: uunpklo z5.d, z1.s +; CHECK-SME-NEXT: sunpklo z6.d, z2.s +; CHECK-SME-NEXT: uunpkhi z1.d, z1.s +; CHECK-SME-NEXT: sunpkhi z2.d, z2.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = sext %b to @@ -728,47 +665,68 @@ entry: } define @not_sudot( %acc, %a, %b) { -; CHECK-LABEL: not_sudot: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: uunpklo z6.d, z4.s -; CHECK-NEXT: sunpkhi z3.d, z3.s -; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: sunpklo z5.d, z1.s -; CHECK-NEXT: uunpklo z6.d, z2.s -; CHECK-NEXT: sunpkhi z1.d, z1.s -; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: not_sudot: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: sunpklo z3.s, z1.h +; CHECK-SVE2-NEXT: uunpklo z4.s, z2.h +; CHECK-SVE2-NEXT: ptrue p0.d +; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h +; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: sunpklo z5.d, z3.s +; CHECK-SVE2-NEXT: 
uunpklo z6.d, z4.s +; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s +; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-NEXT: sunpklo z5.d, z1.s +; CHECK-SVE2-NEXT: uunpklo z6.d, z2.s +; CHECK-SVE2-NEXT: sunpkhi z1.d, z1.s +; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: not_sudot: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z1.h +; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z2.h +; CHECK-SVE2-I8MM-NEXT: ptrue p0.d +; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h +; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h +; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z3.s +; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z4.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z3.d, z3.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z4.d, z4.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z1.s +; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z2.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.d, z1.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.d, z2.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: not_sudot: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h -; CHECK-NEWLOWERING-NEXT: ptrue p0.d -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z1.s -; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s -; 
CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: not_sudot: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: sunpklo z3.s, z1.h +; CHECK-SME-NEXT: uunpklo z4.s, z2.h +; CHECK-SME-NEXT: ptrue p0.d +; CHECK-SME-NEXT: sunpkhi z1.s, z1.h +; CHECK-SME-NEXT: uunpkhi z2.s, z2.h +; CHECK-SME-NEXT: sunpklo z5.d, z3.s +; CHECK-SME-NEXT: uunpklo z6.d, z4.s +; CHECK-SME-NEXT: sunpkhi z3.d, z3.s +; CHECK-SME-NEXT: uunpkhi z4.d, z4.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: sunpklo z5.d, z1.s +; CHECK-SME-NEXT: uunpklo z6.d, z2.s +; CHECK-SME-NEXT: sunpkhi z1.d, z1.s +; CHECK-SME-NEXT: uunpkhi z2.d, z2.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %b.wide = zext %b to @@ -778,49 +736,71 @@ entry: } define @udot_different_types( %acc, %a, %b){ -; CHECK-LABEL: udot_different_types: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: uunpklo z6.d, z4.s -; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: uunpklo z5.d, z1.s -; CHECK-NEXT: uunpklo z6.d, z2.s -; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: udot_different_types: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: and z2.h, 
z2.h, #0xff +; CHECK-SVE2-NEXT: uunpklo z3.s, z1.h +; CHECK-SVE2-NEXT: ptrue p0.d +; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h +; CHECK-SVE2-NEXT: uunpklo z4.s, z2.h +; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: uunpklo z5.d, z3.s +; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s +; CHECK-SVE2-NEXT: uunpklo z6.d, z4.s +; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-NEXT: uunpklo z5.d, z1.s +; CHECK-SVE2-NEXT: uunpklo z6.d, z2.s +; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s +; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: udot_different_types: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h -; CHECK-NEWLOWERING-NEXT: ptrue p0.d -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z1.s -; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: udot_different_types: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff +; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z1.h +; CHECK-SVE2-I8MM-NEXT: ptrue p0.d +; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h +; CHECK-SVE2-I8MM-NEXT: uunpklo 
z4.s, z2.h +; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h +; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z3.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z3.d, z3.s +; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z4.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z4.d, z4.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z1.s +; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z2.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.d, z1.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.d, z2.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-I8MM-NEXT: ret +; +; CHECK-SME-LABEL: udot_different_types: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: and z2.h, z2.h, #0xff +; CHECK-SME-NEXT: uunpklo z3.s, z1.h +; CHECK-SME-NEXT: ptrue p0.d +; CHECK-SME-NEXT: uunpkhi z1.s, z1.h +; CHECK-SME-NEXT: uunpklo z4.s, z2.h +; CHECK-SME-NEXT: uunpkhi z2.s, z2.h +; CHECK-SME-NEXT: uunpklo z5.d, z3.s +; CHECK-SME-NEXT: uunpkhi z3.d, z3.s +; CHECK-SME-NEXT: uunpklo z6.d, z4.s +; CHECK-SME-NEXT: uunpkhi z4.d, z4.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: uunpklo z5.d, z1.s +; CHECK-SME-NEXT: uunpklo z6.d, z2.s +; CHECK-SME-NEXT: uunpkhi z1.d, z1.s +; CHECK-SME-NEXT: uunpkhi z2.d, z2.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -830,51 +810,74 @@ entry: } define @sdot_different_types( %acc, %a, %b){ -; CHECK-LABEL: sdot_different_types: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEXT: sxtb z2.h, p0/m, z2.h -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: sunpkhi z3.d, z3.s -; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEXT: sunpklo z6.d, z4.s -; CHECK-NEXT: sunpkhi 
z4.d, z4.s -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: sunpklo z5.d, z1.s -; CHECK-NEXT: sunpklo z6.d, z2.s -; CHECK-NEXT: sunpkhi z1.d, z1.s -; CHECK-NEXT: sunpkhi z2.d, z2.s -; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: sdot_different_types: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: ptrue p0.h +; CHECK-SVE2-NEXT: sunpklo z3.s, z1.h +; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h +; CHECK-SVE2-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-SVE2-NEXT: ptrue p0.d +; CHECK-SVE2-NEXT: sunpklo z5.d, z3.s +; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s +; CHECK-SVE2-NEXT: sunpklo z4.s, z2.h +; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: sunpklo z6.d, z4.s +; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-NEXT: sunpklo z5.d, z1.s +; CHECK-SVE2-NEXT: sunpklo z6.d, z2.s +; CHECK-SVE2-NEXT: sunpkhi z1.d, z1.s +; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: sdot_different_types: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: ptrue p0.h +; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z1.h +; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h +; CHECK-SVE2-I8MM-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-SVE2-I8MM-NEXT: ptrue p0.d +; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z3.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z3.d, z3.s +; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z2.h +; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h +; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z4.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z4.d, z4.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z1.s +; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z2.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.d, z1.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.d, z2.s +; 
CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: sdot_different_types: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: ptrue p0.h -; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h -; CHECK-NEWLOWERING-NEXT: ptrue p0.d -; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z1.s -; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: sdot_different_types: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: ptrue p0.h +; CHECK-SME-NEXT: sunpklo z3.s, z1.h +; CHECK-SME-NEXT: sunpkhi z1.s, z1.h +; CHECK-SME-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-SME-NEXT: ptrue p0.d +; CHECK-SME-NEXT: sunpklo z5.d, z3.s +; CHECK-SME-NEXT: sunpkhi z3.d, z3.s +; CHECK-SME-NEXT: sunpklo z4.s, z2.h +; CHECK-SME-NEXT: sunpkhi z2.s, z2.h +; CHECK-SME-NEXT: sunpklo z6.d, z4.s +; CHECK-SME-NEXT: sunpkhi z4.d, z4.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: sunpklo z5.d, z1.s +; CHECK-SME-NEXT: sunpklo z6.d, z2.s +; CHECK-SME-NEXT: sunpkhi z1.d, z1.s +; CHECK-SME-NEXT: sunpkhi z2.d, z2.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: mla z0.d, 
p0/m, z1.d, z2.d +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %b.wide = sext %b to @@ -884,51 +887,74 @@ entry: } define @usdot_different_types( %acc, %a, %b){ -; CHECK-LABEL: usdot_different_types: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: sxtb z2.h, p0/m, z2.h -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEXT: sunpklo z6.d, z4.s -; CHECK-NEXT: sunpkhi z4.d, z4.s -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: uunpklo z5.d, z1.s -; CHECK-NEXT: sunpklo z6.d, z2.s -; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: sunpkhi z2.d, z2.s -; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: usdot_different_types: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: ptrue p0.h +; CHECK-SVE2-NEXT: uunpklo z3.s, z1.h +; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h +; CHECK-SVE2-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-SVE2-NEXT: ptrue p0.d +; CHECK-SVE2-NEXT: uunpklo z5.d, z3.s +; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s +; CHECK-SVE2-NEXT: sunpklo z4.s, z2.h +; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: sunpklo z6.d, z4.s +; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-NEXT: uunpklo z5.d, z1.s +; CHECK-SVE2-NEXT: sunpklo z6.d, z2.s +; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s +; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: usdot_different_types: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: ptrue p0.h -; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpkhi 
z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h -; CHECK-NEWLOWERING-NEXT: ptrue p0.d -; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z1.s -; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: usdot_different_types: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: ptrue p0.h +; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z1.h +; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h +; CHECK-SVE2-I8MM-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-SVE2-I8MM-NEXT: ptrue p0.d +; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z3.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z3.d, z3.s +; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z2.h +; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h +; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z4.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z4.d, z4.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z1.s +; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z2.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.d, z1.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.d, z2.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-I8MM-NEXT: ret +; +; CHECK-SME-LABEL: usdot_different_types: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: ptrue p0.h +; CHECK-SME-NEXT: uunpklo z3.s, z1.h +; CHECK-SME-NEXT: uunpkhi z1.s, z1.h +; CHECK-SME-NEXT: sxtb z2.h, p0/m, 
z2.h +; CHECK-SME-NEXT: ptrue p0.d +; CHECK-SME-NEXT: uunpklo z5.d, z3.s +; CHECK-SME-NEXT: uunpkhi z3.d, z3.s +; CHECK-SME-NEXT: sunpklo z4.s, z2.h +; CHECK-SME-NEXT: sunpkhi z2.s, z2.h +; CHECK-SME-NEXT: sunpklo z6.d, z4.s +; CHECK-SME-NEXT: sunpkhi z4.d, z4.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: uunpklo z5.d, z1.s +; CHECK-SME-NEXT: sunpklo z6.d, z2.s +; CHECK-SME-NEXT: uunpkhi z1.d, z1.s +; CHECK-SME-NEXT: sunpkhi z2.d, z2.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = sext %b to @@ -938,49 +964,71 @@ entry: } define @sudot_different_types( %acc, %a, %b){ -; CHECK-LABEL: sudot_different_types: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: sunpkhi z3.d, z3.s -; CHECK-NEXT: uunpklo z6.d, z4.s -; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: sunpklo z5.d, z1.s -; CHECK-NEXT: uunpklo z6.d, z2.s -; CHECK-NEXT: sunpkhi z1.d, z1.s -; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: sudot_different_types: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff +; CHECK-SVE2-NEXT: sunpklo z3.s, z1.h +; CHECK-SVE2-NEXT: ptrue p0.d +; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h +; CHECK-SVE2-NEXT: uunpklo z4.s, z2.h +; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: sunpklo z5.d, z3.s +; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s +; CHECK-SVE2-NEXT: uunpklo z6.d, z4.s +; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; 
CHECK-SVE2-NEXT: sunpklo z5.d, z1.s +; CHECK-SVE2-NEXT: uunpklo z6.d, z2.s +; CHECK-SVE2-NEXT: sunpkhi z1.d, z1.s +; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: sudot_different_types: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff +; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z1.h +; CHECK-SVE2-I8MM-NEXT: ptrue p0.d +; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h +; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z2.h +; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h +; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z3.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z3.d, z3.s +; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z4.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z4.d, z4.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z1.s +; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z2.s +; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.d, z1.s +; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.d, z2.s +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: sudot_different_types: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h -; CHECK-NEWLOWERING-NEXT: ptrue p0.d -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s -; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s -; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z1.s -; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s 
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d -; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: sudot_different_types: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: and z2.h, z2.h, #0xff +; CHECK-SME-NEXT: sunpklo z3.s, z1.h +; CHECK-SME-NEXT: ptrue p0.d +; CHECK-SME-NEXT: sunpkhi z1.s, z1.h +; CHECK-SME-NEXT: uunpklo z4.s, z2.h +; CHECK-SME-NEXT: uunpkhi z2.s, z2.h +; CHECK-SME-NEXT: sunpklo z5.d, z3.s +; CHECK-SME-NEXT: sunpkhi z3.d, z3.s +; CHECK-SME-NEXT: uunpklo z6.d, z4.s +; CHECK-SME-NEXT: uunpkhi z4.d, z4.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: sunpklo z5.d, z1.s +; CHECK-SME-NEXT: uunpklo z6.d, z2.s +; CHECK-SME-NEXT: sunpkhi z1.d, z1.s +; CHECK-SME-NEXT: uunpkhi z2.d, z2.s +; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %b.wide = zext %b to @@ -990,29 +1038,26 @@ entry: } define @udot_nxv8i8_promote ( %acc, %a, %b){ -; CHECK-LABEL: udot_nxv8i8_promote: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEXT: mul z1.h, z1.h, z2.h -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: uunpklo z3.d, z2.s -; CHECK-NEXT: uunpklo z4.d, z1.s -; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: add z0.d, z0.d, z3.d -; CHECK-NEXT: add z2.d, z2.d, z4.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z0.d, z2.d, z0.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: udot_nxv8i8_promote: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff +; CHECK-SVE2-NEXT: and z1.h, z1.h, #0xff +; CHECK-SVE2-NEXT: udot z0.d, z1.h, z2.h +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote: 
-; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: udot_nxv8i8_promote: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff +; CHECK-SVE2-I8MM-NEXT: and z1.h, z1.h, #0xff +; CHECK-SVE2-I8MM-NEXT: udot z0.d, z1.h, z2.h +; CHECK-SVE2-I8MM-NEXT: ret +; +; CHECK-SME-LABEL: udot_nxv8i8_promote: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: and z2.h, z2.h, #0xff +; CHECK-SME-NEXT: and z1.h, z1.h, #0xff +; CHECK-SME-NEXT: udot z0.d, z1.h, z2.h +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -1022,31 +1067,29 @@ entry: } define @sdot_nxv8i8_promote ( %acc, %a, %b){ -; CHECK-LABEL: sdot_nxv8i8_promote: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: sxtb z1.h, p0/m, z1.h -; CHECK-NEXT: sxtb z2.h, p0/m, z2.h -; CHECK-NEXT: mul z1.h, z1.h, z2.h -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: uunpklo z3.d, z2.s -; CHECK-NEXT: uunpklo z4.d, z1.s -; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: add z0.d, z0.d, z3.d -; CHECK-NEXT: add z2.d, z2.d, z4.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z0.d, z2.d, z0.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: sdot_nxv8i8_promote: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: ptrue p0.h +; CHECK-SVE2-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-SVE2-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-SVE2-NEXT: sdot z0.d, z1.h, z2.h +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: sdot_nxv8i8_promote: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: ptrue p0.h +; CHECK-SVE2-I8MM-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-SVE2-I8MM-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-SVE2-I8MM-NEXT: sdot z0.d, z1.h, z2.h +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: 
sdot_nxv8i8_promote: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: ptrue p0.h -; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h -; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h -; CHECK-NEWLOWERING-NEXT: sdot z0.d, z1.h, z2.h -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: sdot_nxv8i8_promote: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: ptrue p0.h +; CHECK-SME-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-SME-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-SME-NEXT: sdot z0.d, z1.h, z2.h +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %b.wide = sext %b to @@ -1056,35 +1099,26 @@ entry: } define @partial_reduce_only_split_acc( %acc, %a, %b) { -; CHECK-LABEL: partial_reduce_only_split_acc: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEXT: and z3.h, z3.h, #0xff -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uunpkhi z4.s, z2.h -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpkhi z5.s, z3.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z6.d, z4.s -; CHECK-NEXT: uunpklo z7.d, z2.s -; CHECK-NEXT: uunpklo z24.d, z5.s -; CHECK-NEXT: uunpklo z25.d, z3.s -; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: uunpkhi z5.d, z5.s -; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: mla z1.d, p0/m, z6.d, z24.d -; CHECK-NEXT: mla z0.d, p0/m, z7.d, z25.d -; CHECK-NEXT: mla z1.d, p0/m, z4.d, z5.d -; CHECK-NEXT: mla z0.d, p0/m, z2.d, z3.d -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: partial_reduce_only_split_acc: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: and z3.h, z3.h, #0xff +; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff +; CHECK-SVE2-NEXT: udot z0.d, z2.h, z3.h +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: partial_reduce_only_split_acc: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: and z3.h, z3.h, #0xff -; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEWLOWERING-NEXT: udot z0.d, z2.h, z3.h -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SVE2-I8MM-LABEL: 
partial_reduce_only_split_acc: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: and z3.h, z3.h, #0xff +; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff +; CHECK-SVE2-I8MM-NEXT: udot z0.d, z2.h, z3.h +; CHECK-SVE2-I8MM-NEXT: ret +; +; CHECK-SME-LABEL: partial_reduce_only_split_acc: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: and z3.h, z3.h, #0xff +; CHECK-SME-NEXT: and z2.h, z2.h, #0xff +; CHECK-SME-NEXT: udot z0.d, z2.h, z3.h +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -1095,25 +1129,23 @@ entry: } define @sdot_imm( %acc, %a) { -; CHECK-LABEL: sdot_imm: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpklo z2.h, z1.b -; CHECK-NEXT: sunpkhi z1.h, z1.b -; CHECK-NEXT: sunpklo z3.s, z2.h -; CHECK-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEXT: sub z0.s, z0.s, z3.s -; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEXT: sub z0.s, z0.s, z2.s -; CHECK-NEXT: sub z0.s, z0.s, z3.s -; CHECK-NEXT: sub z0.s, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: sdot_imm: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: mov z2.b, #-1 // =0xffffffffffffffff +; CHECK-SVE2-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: sdot_imm: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: mov z2.b, #-1 // =0xffffffffffffffff +; CHECK-SVE2-I8MM-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: sdot_imm: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: mov z2.b, #-1 // =0xffffffffffffffff -; CHECK-NEWLOWERING-NEXT: sdot z0.s, z1.b, z2.b -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: sdot_imm: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: mov z2.b, #-1 // =0xffffffffffffffff +; CHECK-SME-NEXT: sdot z0.s, z1.b, z2.b +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %mult = mul nuw nsw %a.wide, splat(i32 -1) @@ -1122,41 +1154,59 @@ entry: } define @sdot_imm_does_not_fit( %acc, %a) { -; CHECK-LABEL: 
sdot_imm_does_not_fit: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sunpklo z2.h, z1.b -; CHECK-NEXT: sunpkhi z1.h, z1.b -; CHECK-NEXT: sunpklo z3.s, z2.h -; CHECK-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEXT: sunpklo z4.s, z1.h -; CHECK-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEXT: lsl z4.s, z4.s, #8 -; CHECK-NEXT: lsl z2.s, z2.s, #8 -; CHECK-NEXT: lsl z3.s, z3.s, #8 -; CHECK-NEXT: lsl z1.s, z1.s, #8 -; CHECK-NEXT: add z0.s, z0.s, z3.s -; CHECK-NEXT: add z2.s, z2.s, z4.s -; CHECK-NEXT: add z0.s, z0.s, z2.s -; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: sdot_imm_does_not_fit: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: sunpklo z2.h, z1.b +; CHECK-SVE2-NEXT: sunpkhi z1.h, z1.b +; CHECK-SVE2-NEXT: sunpklo z3.s, z2.h +; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: sunpklo z4.s, z1.h +; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h +; CHECK-SVE2-NEXT: lsl z4.s, z4.s, #8 +; CHECK-SVE2-NEXT: lsl z2.s, z2.s, #8 +; CHECK-SVE2-NEXT: lsl z3.s, z3.s, #8 +; CHECK-SVE2-NEXT: lsl z1.s, z1.s, #8 +; CHECK-SVE2-NEXT: add z0.s, z0.s, z3.s +; CHECK-SVE2-NEXT: add z2.s, z2.s, z4.s +; CHECK-SVE2-NEXT: add z0.s, z0.s, z2.s +; CHECK-SVE2-NEXT: add z0.s, z0.s, z1.s +; CHECK-SVE2-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: sdot_imm_does_not_fit: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z1.b -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b -; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z2.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z1.h -; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: lsl z4.s, z4.s, #8 -; CHECK-NEWLOWERING-NEXT: lsl z2.s, z2.s, #8 -; CHECK-NEWLOWERING-NEXT: lsl z3.s, z3.s, #8 -; CHECK-NEWLOWERING-NEXT: lsl z1.s, z1.s, #8 -; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z3.s -; CHECK-NEWLOWERING-NEXT: add z2.s, z2.s, z4.s -; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z2.s -; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEWLOWERING-NEXT: ret +; 
CHECK-SVE2-I8MM-LABEL: sdot_imm_does_not_fit: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: sunpklo z2.h, z1.b +; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.h, z1.b +; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z2.h +; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h +; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z1.h +; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h +; CHECK-SVE2-I8MM-NEXT: lsl z4.s, z4.s, #8 +; CHECK-SVE2-I8MM-NEXT: lsl z2.s, z2.s, #8 +; CHECK-SVE2-I8MM-NEXT: lsl z3.s, z3.s, #8 +; CHECK-SVE2-I8MM-NEXT: lsl z1.s, z1.s, #8 +; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z3.s +; CHECK-SVE2-I8MM-NEXT: add z2.s, z2.s, z4.s +; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z2.s +; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z1.s +; CHECK-SVE2-I8MM-NEXT: ret +; +; CHECK-SME-LABEL: sdot_imm_does_not_fit: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: sunpklo z2.h, z1.b +; CHECK-SME-NEXT: sunpkhi z1.h, z1.b +; CHECK-SME-NEXT: sunpklo z3.s, z2.h +; CHECK-SME-NEXT: sunpkhi z2.s, z2.h +; CHECK-SME-NEXT: sunpklo z4.s, z1.h +; CHECK-SME-NEXT: sunpkhi z1.s, z1.h +; CHECK-SME-NEXT: lsl z4.s, z4.s, #8 +; CHECK-SME-NEXT: lsl z2.s, z2.s, #8 +; CHECK-SME-NEXT: lsl z3.s, z3.s, #8 +; CHECK-SME-NEXT: lsl z1.s, z1.s, #8 +; CHECK-SME-NEXT: add z0.s, z0.s, z3.s +; CHECK-SME-NEXT: add z2.s, z2.s, z4.s +; CHECK-SME-NEXT: add z0.s, z0.s, z2.s +; CHECK-SME-NEXT: add z0.s, z0.s, z1.s +; CHECK-SME-NEXT: ret entry: %a.wide = sext %a to %mult = mul nuw nsw %a.wide, splat(i32 256) @@ -1165,27 +1215,23 @@ entry: } define @udot_imm( %acc, %a) { -; CHECK-LABEL: udot_imm: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: mov z2.s, #255 // =0xff -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uunpkhi z1.h, z1.b -; CHECK-NEXT: uunpklo z4.s, z3.h -; CHECK-NEXT: uunpkhi z3.s, z3.h -; CHECK-NEXT: mla z0.s, p0/m, z4.s, z2.s -; CHECK-NEXT: uunpklo z4.s, z1.h -; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: mla z0.s, p0/m, z3.s, z2.s -; CHECK-NEXT: mla z0.s, p0/m, z4.s, z2.s -; CHECK-NEXT: mla z0.s, p0/m, 
z1.s, z2.s -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: udot_imm: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: mov z2.b, #-1 // =0xffffffffffffffff +; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: udot_imm: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: mov z2.b, #-1 // =0xffffffffffffffff +; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: udot_imm: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: mov z2.b, #-1 // =0xffffffffffffffff -; CHECK-NEWLOWERING-NEXT: udot z0.s, z1.b, z2.b -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: udot_imm: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: mov z2.b, #-1 // =0xffffffffffffffff +; CHECK-SME-NEXT: udot z0.s, z1.b, z2.b +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %mult = mul nuw nsw %a.wide, splat(i32 255) @@ -1194,41 +1240,59 @@ entry: } define @udot_imm_does_not_fit( %acc, %a) { -; CHECK-LABEL: udot_imm_does_not_fit: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uunpklo z2.h, z1.b -; CHECK-NEXT: uunpkhi z1.h, z1.b -; CHECK-NEXT: uunpklo z3.s, z2.h -; CHECK-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEXT: uunpklo z4.s, z1.h -; CHECK-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEXT: lsl z4.s, z4.s, #8 -; CHECK-NEXT: lsl z2.s, z2.s, #8 -; CHECK-NEXT: lsl z3.s, z3.s, #8 -; CHECK-NEXT: lsl z1.s, z1.s, #8 -; CHECK-NEXT: add z0.s, z0.s, z3.s -; CHECK-NEXT: add z2.s, z2.s, z4.s -; CHECK-NEXT: add z0.s, z0.s, z2.s -; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK-SVE2-LABEL: udot_imm_does_not_fit: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: uunpklo z2.h, z1.b +; CHECK-SVE2-NEXT: uunpkhi z1.h, z1.b +; CHECK-SVE2-NEXT: uunpklo z3.s, z2.h +; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h +; CHECK-SVE2-NEXT: uunpklo z4.s, z1.h +; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h +; CHECK-SVE2-NEXT: lsl z4.s, z4.s, #8 +; CHECK-SVE2-NEXT: lsl z2.s, z2.s, #8 +; CHECK-SVE2-NEXT: lsl z3.s, z3.s, #8 +; 
CHECK-SVE2-NEXT: lsl z1.s, z1.s, #8 +; CHECK-SVE2-NEXT: add z0.s, z0.s, z3.s +; CHECK-SVE2-NEXT: add z2.s, z2.s, z4.s +; CHECK-SVE2-NEXT: add z0.s, z0.s, z2.s +; CHECK-SVE2-NEXT: add z0.s, z0.s, z1.s +; CHECK-SVE2-NEXT: ret +; +; CHECK-SVE2-I8MM-LABEL: udot_imm_does_not_fit: +; CHECK-SVE2-I8MM: // %bb.0: // %entry +; CHECK-SVE2-I8MM-NEXT: uunpklo z2.h, z1.b +; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.h, z1.b +; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z2.h +; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h +; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z1.h +; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h +; CHECK-SVE2-I8MM-NEXT: lsl z4.s, z4.s, #8 +; CHECK-SVE2-I8MM-NEXT: lsl z2.s, z2.s, #8 +; CHECK-SVE2-I8MM-NEXT: lsl z3.s, z3.s, #8 +; CHECK-SVE2-I8MM-NEXT: lsl z1.s, z1.s, #8 +; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z3.s +; CHECK-SVE2-I8MM-NEXT: add z2.s, z2.s, z4.s +; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z2.s +; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z1.s +; CHECK-SVE2-I8MM-NEXT: ret ; -; CHECK-NEWLOWERING-LABEL: udot_imm_does_not_fit: -; CHECK-NEWLOWERING: // %bb.0: // %entry -; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z1.b -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b -; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h -; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h -; CHECK-NEWLOWERING-NEXT: lsl z4.s, z4.s, #8 -; CHECK-NEWLOWERING-NEXT: lsl z2.s, z2.s, #8 -; CHECK-NEWLOWERING-NEXT: lsl z3.s, z3.s, #8 -; CHECK-NEWLOWERING-NEXT: lsl z1.s, z1.s, #8 -; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z3.s -; CHECK-NEWLOWERING-NEXT: add z2.s, z2.s, z4.s -; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z2.s -; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEWLOWERING-NEXT: ret +; CHECK-SME-LABEL: udot_imm_does_not_fit: +; CHECK-SME: // %bb.0: // %entry +; CHECK-SME-NEXT: uunpklo z2.h, z1.b +; CHECK-SME-NEXT: uunpkhi z1.h, z1.b +; CHECK-SME-NEXT: uunpklo z3.s, z2.h +; CHECK-SME-NEXT: uunpkhi z2.s, z2.h +; CHECK-SME-NEXT: 
uunpklo z4.s, z1.h +; CHECK-SME-NEXT: uunpkhi z1.s, z1.h +; CHECK-SME-NEXT: lsl z4.s, z4.s, #8 +; CHECK-SME-NEXT: lsl z2.s, z2.s, #8 +; CHECK-SME-NEXT: lsl z3.s, z3.s, #8 +; CHECK-SME-NEXT: lsl z1.s, z1.s, #8 +; CHECK-SME-NEXT: add z0.s, z0.s, z3.s +; CHECK-SME-NEXT: add z2.s, z2.s, z4.s +; CHECK-SME-NEXT: add z0.s, z0.s, z2.s +; CHECK-SME-NEXT: add z0.s, z0.s, z1.s +; CHECK-SME-NEXT: ret entry: %a.wide = zext %a to %mult = mul nuw nsw %a.wide, splat(i32 256) diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll index 428dd4c3a0154..e62979d077fd2 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll @@ -1,16 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK-SVE2 -; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefixes=CHECK-SVE -; RUN: llc -mtriple=aarch64 -mattr=+sve -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING-SVE -; RUN: llc -mtriple=aarch64 -mattr=+sve2 -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING-SVE2 +; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefix=CHECK-SVE +; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE2 define @signed_wide_add_nxv4i32( %acc, %input){ -; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: saddwb z0.d, z0.d, z1.s -; CHECK-SVE2-NEXT: saddwt z0.d, z0.d, z1.s -; CHECK-SVE2-NEXT: ret -; ; CHECK-SVE-LABEL: signed_wide_add_nxv4i32: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: sunpklo z2.d, z1.s @@ -19,19 +11,11 @@ define @signed_wide_add_nxv4i32( %acc, %input to %partial.reduce = tail call 
@llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %input.wide) @@ -39,12 +23,6 @@ entry: } define @unsigned_wide_add_nxv4i32( %acc, %input){ -; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i32: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: uaddwb z0.d, z0.d, z1.s -; CHECK-SVE2-NEXT: uaddwt z0.d, z0.d, z1.s -; CHECK-SVE2-NEXT: ret -; ; CHECK-SVE-LABEL: unsigned_wide_add_nxv4i32: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: uunpklo z2.d, z1.s @@ -53,19 +31,11 @@ define @unsigned_wide_add_nxv4i32( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %input.wide) @@ -73,12 +43,6 @@ entry: } define @signed_wide_add_nxv8i16( %acc, %input){ -; CHECK-SVE2-LABEL: signed_wide_add_nxv8i16: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: saddwb z0.s, z0.s, z1.h -; CHECK-SVE2-NEXT: saddwt z0.s, z0.s, z1.h -; CHECK-SVE2-NEXT: ret -; ; CHECK-SVE-LABEL: signed_wide_add_nxv8i16: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: sunpklo z2.s, z1.h @@ -87,19 +51,11 @@ define @signed_wide_add_nxv8i16( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32( %acc, %input.wide) @@ -107,12 +63,6 @@ entry: } define @unsigned_wide_add_nxv8i16( %acc, %input){ -; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i16: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: uaddwb z0.s, z0.s, z1.h -; CHECK-SVE2-NEXT: uaddwt z0.s, z0.s, z1.h -; CHECK-SVE2-NEXT: ret -; ; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i16: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: uunpklo z2.s, z1.h @@ -121,19 +71,11 @@ define @unsigned_wide_add_nxv8i16( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32( %acc, %input.wide) @@ -141,12 +83,6 @@ entry: } define @signed_wide_add_nxv16i8( %acc, %input){ -; CHECK-SVE2-LABEL: signed_wide_add_nxv16i8: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: saddwb z0.h, z0.h, z1.b -; 
CHECK-SVE2-NEXT: saddwt z0.h, z0.h, z1.b -; CHECK-SVE2-NEXT: ret -; ; CHECK-SVE-LABEL: signed_wide_add_nxv16i8: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: sunpklo z2.h, z1.b @@ -155,19 +91,11 @@ define @signed_wide_add_nxv16i8( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16( %acc, %input.wide) @@ -175,12 +103,6 @@ entry: } define @unsigned_wide_add_nxv16i8( %acc, %input){ -; CHECK-SVE2-LABEL: unsigned_wide_add_nxv16i8: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: uaddwb z0.h, z0.h, z1.b -; CHECK-SVE2-NEXT: uaddwt z0.h, z0.h, z1.b -; CHECK-SVE2-NEXT: ret -; ; CHECK-SVE-LABEL: unsigned_wide_add_nxv16i8: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: uunpklo z2.h, z1.b @@ -189,19 +111,11 @@ define @unsigned_wide_add_nxv16i8( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16( %acc, %input.wide) @@ -209,16 +123,6 @@ entry: } define @signed_wide_add_nxv4i16( %acc, %input){ -; CHECK-SVE2-LABEL: signed_wide_add_nxv4i16: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: ptrue p0.s -; CHECK-SVE2-NEXT: sxth z1.s, p0/m, z1.s -; CHECK-SVE2-NEXT: uunpklo z2.d, z1.s -; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s -; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d -; CHECK-SVE2-NEXT: add z0.d, z1.d, z0.d -; CHECK-SVE2-NEXT: ret -; ; CHECK-SVE-LABEL: signed_wide_add_nxv4i16: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: ptrue p0.s @@ -229,23 +133,13 @@ define @signed_wide_add_nxv4i16( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32( %acc, %input.wide) @@ -253,15 +147,6 @@ entry: } define @unsigned_wide_add_nxv4i16( %acc, %input){ -; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i16: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: and z1.s, z1.s, #0xffff -; CHECK-SVE2-NEXT: uunpklo z2.d, z1.s -; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s -; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d -; CHECK-SVE2-NEXT: add z0.d, 
z1.d, z0.d -; CHECK-SVE2-NEXT: ret -; ; CHECK-SVE-LABEL: unsigned_wide_add_nxv4i16: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: and z1.s, z1.s, #0xffff @@ -271,21 +156,12 @@ define @unsigned_wide_add_nxv4i16( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32( %acc, %input.wide) @@ -293,18 +169,6 @@ entry: } define @signed_wide_add_nxv8i32( %acc, %input){ -; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: sunpklo z4.d, z3.s -; CHECK-SVE2-NEXT: sunpklo z5.d, z2.s -; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s -; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s -; CHECK-SVE2-NEXT: add z0.d, z0.d, z5.d -; CHECK-SVE2-NEXT: add z1.d, z1.d, z4.d -; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d -; CHECK-SVE2-NEXT: add z1.d, z1.d, z3.d -; CHECK-SVE2-NEXT: ret -; ; CHECK-SVE-LABEL: signed_wide_add_nxv8i32: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: sunpklo z4.d, z3.s @@ -317,25 +181,13 @@ define @signed_wide_add_nxv8i32( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %input.wide) @@ -343,18 +195,6 @@ entry: } define @unsigned_wide_add_nxv8i32( %acc, %input){ -; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32: -; CHECK-SVE2: // %bb.0: // %entry -; CHECK-SVE2-NEXT: uunpklo z4.d, z3.s -; CHECK-SVE2-NEXT: uunpklo z5.d, z2.s -; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s -; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s -; CHECK-SVE2-NEXT: add z0.d, z0.d, z5.d -; CHECK-SVE2-NEXT: add z1.d, z1.d, z4.d -; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d -; CHECK-SVE2-NEXT: add z1.d, z1.d, z3.d -; CHECK-SVE2-NEXT: ret -; ; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i32: ; CHECK-SVE: // %bb.0: // %entry ; CHECK-SVE-NEXT: uunpklo z4.d, z3.s @@ -367,25 +207,13 @@ define @unsigned_wide_add_nxv8i32( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64( %acc, %input.wide) diff --git 
a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll index 4607f225f81ea..a799b51f15cb1 100644 --- a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s | FileCheck %s +; RUN: llc -use-constant-int-for-scalable-splat < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 46da9d33639b6..86e73ed03f187 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2019,9 +2019,7 @@ define float @v_fneg_minimumnum_f32_no_ieee(float %a, float %b) #4 { ; GCN-LABEL: v_fneg_minimumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_max_f32_e64 v0, -v0, -v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minimumnum.f32(float %a, float %b) %fneg = fneg float %min @@ -2044,8 +2042,7 @@ define float @v_fneg_self_minimumnum_f32_no_ieee(float %a) #4 { ; GCN-LABEL: v_fneg_self_minimumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, v0, v0 +; GCN-NEXT: v_max_f32_e64 v0, -v0, -v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minimumnum.f32(float %a, float %a) %min.fneg = fneg float %min @@ -2068,8 +2065,7 @@ define float @v_fneg_posk_minimumnum_f32_no_ieee(float %a) #4 { ; GCN-LABEL: v_fneg_posk_minimumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, -4.0, v0 +; GCN-NEXT: v_max_f32_e64 v0, -v0, -4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] 
%min = call float @llvm.minimumnum.f32(float 4.0, float %a) %fneg = fneg float %min @@ -2092,8 +2088,7 @@ define float @v_fneg_negk_minimumnum_f32_no_ieee(float %a) #4 { ; GCN-LABEL: v_fneg_negk_minimumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_max_f32_e64 v0, -v0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minimumnum.f32(float -4.0, float %a) %fneg = fneg float %min @@ -2251,8 +2246,7 @@ define float @v_fneg_neg0_minimumnum_f32_no_ieee(float %a) #4 { ; GCN-LABEL: v_fneg_neg0_minimumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, 0, v0 +; GCN-NEXT: v_max_f32_e64 v0, -v0, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minimumnum.f32(float -0.0, float %a) %fneg = fneg float %min @@ -2299,7 +2293,6 @@ define float @v_fneg_0_minimumnum_foldable_use_f32_no_ieee(float %a, float %b) # ; GCN-LABEL: v_fneg_0_minimumnum_foldable_use_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, 0, v0 ; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2330,9 +2323,7 @@ define <2 x float> @v_fneg_minimumnum_multi_use_minimumnum_f32_no_ieee(float %a, ; GCN-LABEL: v_fneg_minimumnum_multi_use_minimumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_max_f32_e64 v0, -v0, -v1 ; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minimumnum.f32(float %a, float %b) @@ -2364,9 +2355,7 @@ define float @v_fneg_maximumnum_f32_no_ieee(float %a, float %b) #4 { ; GCN-LABEL: v_fneg_maximumnum_f32_no_ieee: ; 
GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_min_f32_e64 v0, -v0, -v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maximumnum.f32(float %a, float %b) %fneg = fneg float %max @@ -2389,8 +2378,7 @@ define float @v_fneg_self_maximumnum_f32_no_ieee(float %a) #4 { ; GCN-LABEL: v_fneg_self_maximumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, v0, v0 +; GCN-NEXT: v_min_f32_e64 v0, -v0, -v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maximumnum.f32(float %a, float %a) %max.fneg = fneg float %max @@ -2413,8 +2401,7 @@ define float @v_fneg_posk_maximumnum_f32_no_ieee(float %a) #4 { ; GCN-LABEL: v_fneg_posk_maximumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, -4.0, v0 +; GCN-NEXT: v_min_f32_e64 v0, -v0, -4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maximumnum.f32(float 4.0, float %a) %fneg = fneg float %max @@ -2437,8 +2424,7 @@ define float @v_fneg_negk_maximumnum_f32_no_ieee(float %a) #4 { ; GCN-LABEL: v_fneg_negk_maximumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GCN-NEXT: v_min_f32_e64 v0, -v0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maximumnum.f32(float -4.0, float %a) %fneg = fneg float %max @@ -2473,8 +2459,7 @@ define float @v_fneg_neg0_maximumnum_f32_no_ieee(float %a) #4 { ; GCN-LABEL: v_fneg_neg0_maximumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, 0, v0 +; GCN-NEXT: v_min_f32_e64 v0, -v0, 0 ; GCN-NEXT: 
s_setpc_b64 s[30:31] %max = call float @llvm.maximumnum.f32(float -0.0, float %a) %fneg = fneg float %max @@ -2499,7 +2484,6 @@ define float @v_fneg_0_maximumnum_foldable_use_f32_no_ieee(float %a, float %b) # ; GCN-LABEL: v_fneg_0_maximumnum_foldable_use_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v0, 0, v0 ; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -2530,9 +2514,7 @@ define <2 x float> @v_fneg_maximumnum_multi_use_maximumnum_f32_no_ieee(float %a, ; GCN-LABEL: v_fneg_maximumnum_multi_use_maximumnum_f32_no_ieee: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_min_f32_e64 v0, -v0, -v1 ; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maximumnum.f32(float %a, float %b) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index a56c92785d487..92a2f54841eed 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -113,17 +113,11 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) { ; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_f16__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_f16__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_f16__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: v_maximum_f16__nnan: ; GFX10: ; %bb.0: @@ -270,17 +264,11 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) { ; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_f16__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_f16__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_f16__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -771,17 +759,11 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v2f16__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v2f16__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v2f16__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f16__nnan: ; GFX10: ; %bb.0: @@ -939,17 +921,11 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v2f16__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_max_f16 
v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v2f16__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v2f16__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -1296,19 +1272,12 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v3f16__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX900-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v3f16__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v3f16__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f16__nnan: ; GFX10: ; %bb.0: @@ -1501,19 +1470,12 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v3f16__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX900-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v3f16__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v3f16__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -1741,19 +1703,12 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v4f16__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX900-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v4f16__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v4f16__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f16__nnan: ; GFX10: ; %bb.0: @@ -1981,19 +1936,12 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v4f16__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX900-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v4f16__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 -; 
GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v4f16__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -2788,4 +2736,3 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} -; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index 826bf427503ab..6c4f13a4eab8f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -85,17 +85,11 @@ define float @v_maximum_f32__nnan(float %src0, float %src1) { ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_f32__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_f32__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_f32__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f32__nnan: ; GFX10: ; %bb.0: @@ -199,17 +193,11 @@ define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) { ; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_f32__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_f32__nnan_nsz: -; 
GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_f32__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_f32__nnan_nsz: ; GFX10: ; %bb.0: @@ -568,19 +556,12 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1) ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v2f32__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX900-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v2f32__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v0, v0, v2, v2 -; GFX950-NEXT: v_maximum3_f32 v1, v1, v3, v3 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v2f32__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f32__nnan: ; GFX10: ; %bb.0: @@ -704,19 +685,12 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v2f32__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX900-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v2f32__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v0, v0, v2, v2 -; GFX950-NEXT: v_maximum3_f32 v1, v1, v3, v3 -; GFX950-NEXT: 
s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v2f32__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f32__nnan_nsz: ; GFX10: ; %bb.0: @@ -971,21 +945,13 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1) ; GFX8-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v3f32__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX900-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX900-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v3f32__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v0, v0, v3, v3 -; GFX950-NEXT: v_maximum3_f32 v1, v1, v4, v4 -; GFX950-NEXT: v_maximum3_f32 v2, v2, v5, v5 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v3f32__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f32__nnan: ; GFX10: ; %bb.0: @@ -1131,21 +1097,13 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr ; GFX8-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v3f32__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX900-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX900-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v3f32__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v0, v0, 
v3, v3 -; GFX950-NEXT: v_maximum3_f32 v1, v1, v4, v4 -; GFX950-NEXT: v_maximum3_f32 v2, v2, v5, v5 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v3f32__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f32__nnan_nsz: ; GFX10: ; %bb.0: @@ -1310,23 +1268,14 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1) ; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v4f32__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX900-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX900-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX900-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v4f32__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v0, v0, v4, v4 -; GFX950-NEXT: v_maximum3_f32 v1, v1, v5, v5 -; GFX950-NEXT: v_maximum3_f32 v2, v2, v6, v6 -; GFX950-NEXT: v_maximum3_f32 v3, v3, v7, v7 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v4f32__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f32__nnan: ; GFX10: ; %bb.0: @@ -1493,23 +1442,14 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr ; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_maximum_v4f32__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_max_f32_e32 v0, v0, v4 
-; GFX900-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX900-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX900-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_maximum_v4f32__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_maximum3_f32 v0, v0, v4, v4 -; GFX950-NEXT: v_maximum3_f32 v1, v1, v5, v5 -; GFX950-NEXT: v_maximum3_f32 v2, v2, v6, v6 -; GFX950-NEXT: v_maximum3_f32 v3, v3, v7, v7 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximum_v4f32__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f32__nnan_nsz: ; GFX10: ; %bb.0: @@ -2051,4 +1991,3 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GCN: {{.*}} -; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 3dcc70b0ea3b6..9e82b41bb9585 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -91,17 +91,11 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) { ; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_f16__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_f16__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_f16__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan: ; GFX10: ; %bb.0: @@ -225,17 +219,11 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) { ; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_f16__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_f16__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_f16__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -646,17 +634,11 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> 
%src1) { ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v2f16__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v2f16__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v2f16__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f16__nnan: ; GFX10: ; %bb.0: @@ -779,17 +761,11 @@ define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v2f16__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v2f16__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v2f16__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -1062,19 +1038,12 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v3f16__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX900-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: 
v_minimum_v3f16__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v3f16__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f16__nnan: ; GFX10: ; %bb.0: @@ -1220,19 +1189,12 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v3f16__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX900-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v3f16__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v3f16__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -1401,19 +1363,12 @@ define <4 x half> @v_minimum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v4f16__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX900-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v4f16__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v4f16__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f16__nnan: ; GFX10: ; %bb.0: @@ -1582,19 +1537,12 @@ define <4 x half> @v_minimum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v4f16__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX900-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v4f16__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 -; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v4f16__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -2207,4 +2155,3 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GCN: {{.*}} -; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 0215795467323..8adbe861fe6f0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -85,17 +85,11 @@ define float @v_minimum_f32__nnan(float %src0, float %src1) { ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_f32__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_f32__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_f32__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f32__nnan: ; GFX10: ; %bb.0: @@ -199,17 +193,11 @@ define float @v_minimum_f32__nnan_nsz(float %src0, float %src1) { ; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_f32__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_f32__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v0, v0, v1, v1 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_f32__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_f32__nnan_nsz: ; GFX10: ; %bb.0: @@ -568,19 +556,12 @@ define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x 
float> %src1) ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v2f32__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX900-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v2f32__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v0, v0, v2, v2 -; GFX950-NEXT: v_minimum3_f32 v1, v1, v3, v3 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v2f32__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f32__nnan: ; GFX10: ; %bb.0: @@ -704,19 +685,12 @@ define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr ; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v2f32__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX900-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v2f32__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v0, v0, v2, v2 -; GFX950-NEXT: v_minimum3_f32 v1, v1, v3, v3 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v2f32__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f32__nnan_nsz: ; GFX10: ; %bb.0: @@ -971,21 +945,13 @@ define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1) ; GFX8-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX900-LABEL: v_minimum_v3f32__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX900-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX900-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v3f32__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v0, v0, v3, v3 -; GFX950-NEXT: v_minimum3_f32 v1, v1, v4, v4 -; GFX950-NEXT: v_minimum3_f32 v2, v2, v5, v5 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v3f32__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f32__nnan: ; GFX10: ; %bb.0: @@ -1131,21 +1097,13 @@ define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr ; GFX8-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v3f32__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX900-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX900-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v3f32__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v0, v0, v3, v3 -; GFX950-NEXT: v_minimum3_f32 v1, v1, v4, v4 -; GFX950-NEXT: v_minimum3_f32 v2, v2, v5, v5 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v3f32__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f32__nnan_nsz: ; GFX10: ; %bb.0: @@ -1310,23 +1268,14 @@ 
define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1) ; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v4f32__nnan: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX900-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX900-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX900-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v4f32__nnan: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v0, v0, v4, v4 -; GFX950-NEXT: v_minimum3_f32 v1, v1, v5, v5 -; GFX950-NEXT: v_minimum3_f32 v2, v2, v6, v6 -; GFX950-NEXT: v_minimum3_f32 v3, v3, v7, v7 -; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v4f32__nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f32__nnan: ; GFX10: ; %bb.0: @@ -1493,23 +1442,14 @@ define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr ; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_minimum_v4f32__nnan_nsz: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX900-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX900-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX900-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX900-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-LABEL: v_minimum_v4f32__nnan_nsz: -; GFX950: ; %bb.0: -; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_minimum3_f32 v0, v0, v4, v4 -; GFX950-NEXT: v_minimum3_f32 v1, v1, v5, v5 -; GFX950-NEXT: v_minimum3_f32 v2, v2, v6, v6 -; GFX950-NEXT: v_minimum3_f32 v3, v3, v7, v7 -; GFX950-NEXT: 
s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimum_v4f32__nnan_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f32__nnan_nsz: ; GFX10: ; %bb.0: @@ -2051,4 +1991,3 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} -; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index c45d86ce306e7..4f73e8e9c1883 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -3414,8 +3414,8 @@ define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v2, v3 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3652,57 +3652,57 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v0, v0 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v2, v2 +; 
GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v4, v5 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v2 ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-SDAG-LABEL: v_maximumnum_v3f16: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX900-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_maximumnum_v3f16: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_maximumnum_v3f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v3 -; 
GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX900-GISEL-LABEL: v_maximumnum_v3f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX950-SDAG-LABEL: v_maximumnum_v3f16: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX950-GISEL-LABEL: v_maximumnum_v3f16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_maximumnum_v3f16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v3 @@ -3712,8 +3712,8 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX10-GISEL: ; %bb.0: ; 
GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v3 @@ -3722,11 +3722,11 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX11-SDAG-LABEL: v_maximumnum_v3f16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX11-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX11-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3735,10 +3735,10 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3750,11 +3750,11 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_pk_max_num_f16 v3, v3, v3 ; GFX12-SDAG-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-SDAG-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-NEXT: v_pk_max_num_f16 v3, v3, v3 ; GFX12-SDAG-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_pk_max_num_f16 v0, v0, v2 ; GFX12-SDAG-NEXT: v_pk_max_num_f16 v1, v1, v3 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3767,10 +3767,10 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v2 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3834,12 +3834,19 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximumnum_v3f16_nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_maximumnum_v3f16_nnan: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-GISEL-LABEL: v_maximumnum_v3f16_nnan: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximumnum_v3f16_nnan: ; GFX10: ; %bb.0: @@ -3939,16 +3946,16 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v7, v3, v3 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v4, v6 +; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v4, v5 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v5, v7 +; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v2, v5 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v2, v1 @@ -3965,16 +3972,16 
@@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_maximumnum_v4f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX900-GISEL-LABEL: v_maximumnum_v4f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-SDAG-LABEL: v_maximumnum_v4f16: ; GFX950-SDAG: ; %bb.0: @@ -3988,6 +3995,18 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX950-GISEL-LABEL: v_maximumnum_v4f16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SDAG-LABEL: v_maximumnum_v4f16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4003,8 +4022,8 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX10-GISEL: 
; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v3 @@ -4026,10 +4045,10 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -4058,10 +4077,10 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v2 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -4268,22 +4287,22 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
; GFX8-GISEL-NEXT: v_max_f16_e32 v6, v0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v7, v3, v3 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v2, v2 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v3, v3 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v10, v4, v4 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v6, v6, v9 +; GFX8-GISEL-NEXT: v_max_f16_e32 v6, v6, v7 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v7, v10 +; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v3, v7 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v8, v11 +; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX8-GISEL-NEXT: v_max_f16_e32 v7, v5, v5 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: 
v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v4, v7 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v6, v0 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v3, v1 @@ -4304,19 +4323,19 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_max_f16 v2, v2, v3 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_maximumnum_v6f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX9-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v3 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v4 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v5 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX900-GISEL-LABEL: v_maximumnum_v6f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-SDAG-LABEL: v_maximumnum_v6f16: ; GFX950-SDAG: ; %bb.0: @@ -4333,6 +4352,21 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v3 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX950-GISEL-LABEL: v_maximumnum_v6f16: +; GFX950-GISEL: ; %bb.0: +; 
GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v3 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SDAG-LABEL: v_maximumnum_v6f16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4351,10 +4385,10 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v3 ; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v4 @@ -4380,15 +4414,14 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX11-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v4 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4419,15 +4452,14 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v5 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <6 x half> @llvm.maximumnum.v6f16(<6 x half> %x, <6 x half> %y) @@ -4554,28 +4586,28 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v4, v4 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v10, v2, v2 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 
-; GFX8-GISEL-NEXT: v_max_f16_e32 v11, v3, v3 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v12, v4, v4 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v13, v5, v5 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v14, v6, v6 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v15, v7, v7 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v8, v12 +; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v8, v9 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v9, v13 +; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v5, v5 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v4, v9 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v10, v14 +; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v6, v6 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v5, v9 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v6 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v6, v11, v15 +; GFX8-GISEL-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v7, v7 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v6, v6, v9 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v4, v1 @@ -4600,22 +4632,22 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_max_f16 v3, v3, v4 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_maximumnum_v8f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX9-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX9-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX9-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v6 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v7 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX900-GISEL-LABEL: v_maximumnum_v8f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v4 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; 
GFX900-GISEL-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v4 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-SDAG-LABEL: v_maximumnum_v8f16: ; GFX950-SDAG: ; %bb.0: @@ -4635,6 +4667,24 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX950-SDAG-NEXT: v_pk_max_f16 v3, v3, v4 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX950-GISEL-LABEL: v_maximumnum_v8f16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v4 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SDAG-LABEL: v_maximumnum_v8f16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4656,12 +4706,12 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX10-GISEL-NEXT: 
v_pk_max_f16 v3, v3, v3 ; GFX10-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v4 ; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v5 @@ -4691,18 +4741,17 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX11-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v4 ; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v6 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v7 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4736,18 +4785,17 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v6 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v7 -; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v4 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v5 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v6 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v7 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %x, <8 x half> %y) @@ -4978,52 +5026,52 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v16, v0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v8, v8 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v8, v8 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v8, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v16, v16, v17 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v9, v9 +; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v9, v9 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v9, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v2, v2 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v8, v17 ; GFX8-GISEL-NEXT: 
v_max_f16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v10, v10 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v2, v2 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v10, v10 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v10, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v16, v16, v19 -; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v3, v3 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v9, v17 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v10, v11, v11 +; GFX8-GISEL-NEXT: v_max_f16_e32 v10, v3, v3 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v11, v11 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v11, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v17, v8 -; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v4, v4 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v10, v10, v17 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v11, v12, v12 +; GFX8-GISEL-NEXT: v_max_f16_e32 v11, v4, v4 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v12, v12 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v12, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v18, v9 -; 
GFX8-GISEL-NEXT: v_max_f16_e32 v18, v5, v5 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v11, v11, v17 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v12, v13, v13 +; GFX8-GISEL-NEXT: v_max_f16_e32 v12, v5, v5 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v13, v13 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v13, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v10, v19, v10 -; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v6, v6 +; GFX8-GISEL-NEXT: v_max_f16_e32 v12, v12, v17 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-GISEL-NEXT: v_max_f16_e32 v13, v6, v6 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v14, v14 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v11, v17, v11 -; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v7, v7 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v12, v18, v12 -; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v14, v14 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v14, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v13, v15, v15 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v19, v18 +; GFX8-GISEL-NEXT: v_max_f16_e32 v13, v13, v17 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v6, v6, v14 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v13, v17, v13 +; GFX8-GISEL-NEXT: v_max_f16_e32 v14, v7, v7 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v15, v15 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v14, v14, v17 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v16, v0 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v8, v1 @@ -5031,8 +5079,8 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX8-GISEL-NEXT: v_or_b32_e32 v4, v11, v4 ; GFX8-GISEL-NEXT: v_or_b32_e32 v5, v12, v5 -; GFX8-GISEL-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX8-GISEL-NEXT: v_or_b32_e32 v7, v13, v7 +; GFX8-GISEL-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX8-GISEL-NEXT: v_or_b32_e32 v7, v14, v7 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: v_maximumnum_v16f16: @@ -5064,34 +5112,34 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_max_f16 v7, v7, v8 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_maximumnum_v16f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX9-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX9-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX9-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX9-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX9-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX9-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX9-GISEL-NEXT: 
v_pk_max_f16 v11, v11, v11 -; GFX9-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX9-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX9-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX9-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v8 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v9 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v10 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v11 -; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v4, v12 -; GFX9-GISEL-NEXT: v_pk_max_f16 v5, v5, v13 -; GFX9-GISEL-NEXT: v_pk_max_f16 v6, v6, v14 -; GFX9-GISEL-NEXT: v_pk_max_f16 v7, v7, v15 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX900-GISEL-LABEL: v_maximumnum_v16f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v9, v9 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v10, v10 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v11, v11 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v12, v12 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v4, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v13, v13 +; GFX900-GISEL-NEXT: v_pk_max_f16 v5, v5, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v14, v14 +; GFX900-GISEL-NEXT: v_pk_max_f16 v6, v6, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v15, v15 +; GFX900-GISEL-NEXT: v_pk_max_f16 v7, v7, v8 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-SDAG-LABEL: v_maximumnum_v16f16: ; GFX950-SDAG: ; %bb.0: @@ -5123,15 +5171,45 @@ 
define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX950-SDAG-NEXT: v_pk_max_f16 v7, v7, v8 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_maximumnum_v16f16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-SDAG-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-SDAG-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX10-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-LABEL: v_maximumnum_v16f16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v9, v9 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v10, v10 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v11, v11 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v12, v12 +; GFX950-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v13, v13 +; GFX950-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX950-GISEL-NEXT: v_pk_max_f16 v5, v5, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v14, v14 +; GFX950-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX950-GISEL-NEXT: v_pk_max_f16 v6, v6, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v15, v15 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v7, v7, v8 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_maximumnum_v16f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_max_f16 v8, v8, v8 +; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-SDAG-NEXT: v_pk_max_f16 v9, v9, v9 +; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-SDAG-NEXT: v_pk_max_f16 v10, v10, v10 +; GFX10-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v8 ; GFX10-SDAG-NEXT: v_pk_max_f16 v8, v11, v11 ; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v9 @@ -5156,29 +5234,29 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 ; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 ; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 +; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v8 ; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v9 +; GFX10-GISEL-NEXT: v_pk_max_f16 v8, v11, v11 +; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v10 ; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX10-GISEL-NEXT: v_pk_max_f16 v9, v12, v12 ; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX10-GISEL-NEXT: v_pk_max_f16 v10, v13, v13 ; GFX10-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX10-GISEL-NEXT: v_pk_max_f16 v11, v14, v14 ; GFX10-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX10-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX10-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX10-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX10-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX10-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX10-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX10-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX10-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 -; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v8 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v9 -; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v10 -; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v11 -; GFX10-GISEL-NEXT: 
v_pk_max_f16 v4, v4, v12 -; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v13 -; GFX10-GISEL-NEXT: v_pk_max_f16 v6, v6, v14 -; GFX10-GISEL-NEXT: v_pk_max_f16 v7, v7, v15 +; GFX10-GISEL-NEXT: v_pk_max_f16 v12, v15, v15 +; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v8 +; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v9 +; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v10 +; GFX10-GISEL-NEXT: v_pk_max_f16 v6, v6, v11 +; GFX10-GISEL-NEXT: v_pk_max_f16 v7, v7, v12 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_maximumnum_v16f16: @@ -5214,29 +5292,29 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX11-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 ; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 ; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 +; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v8 ; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v9 +; GFX11-GISEL-NEXT: v_pk_max_f16 v8, v11, v11 +; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v10 ; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-GISEL-NEXT: v_pk_max_f16 v9, v12, v12 ; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-GISEL-NEXT: v_pk_max_f16 v10, v13, v13 ; GFX11-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX11-GISEL-NEXT: v_pk_max_f16 v11, v14, v14 ; GFX11-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX11-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX11-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX11-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX11-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX11-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX11-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX11-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX11-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 -; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v8 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v9 -; 
GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v10 -; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v11 -; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v12 -; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v13 -; GFX11-GISEL-NEXT: v_pk_max_f16 v6, v6, v14 -; GFX11-GISEL-NEXT: v_pk_max_f16 v7, v7, v15 +; GFX11-GISEL-NEXT: v_pk_max_f16 v12, v15, v15 +; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v8 +; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v9 +; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v10 +; GFX11-GISEL-NEXT: v_pk_max_f16 v6, v6, v11 +; GFX11-GISEL-NEXT: v_pk_max_f16 v7, v7, v12 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: v_maximumnum_v16f16: @@ -5280,29 +5358,29 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v8, v8, v8 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v9, v9, v9 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v10, v10, v10 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v8 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v9 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v8, v11, v11 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v10 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v9, v12, v12 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v10, v13, v13 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v6 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v11, v14, v14 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v7 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v8, v8, v8 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v9, v9, v9 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v10, v10, v10 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v11, v11, v11 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v12, v12, v12 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v13, v13, v13 
-; GFX12-GISEL-NEXT: v_pk_max_num_f16 v14, v14, v14 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v15, v15, v15 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v8 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v9 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v10 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v11 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v12 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v13 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v14 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v15 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v12, v15, v15 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v8 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v9 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v10 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v11 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v12 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> %x, <16 x half> %y) ret <16 x half> %result @@ -6174,34 +6252,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX950-GISEL-NEXT: v_pk_max_f16 v16, v16, v16 ; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX950-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX950-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX950-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX950-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX950-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX950-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX950-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX950-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX950-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX950-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX950-GISEL-NEXT: v_pk_max_f16 v17, v17, v17 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX950-GISEL-NEXT: v_pk_max_f16 v18, v18, v18 +; 
GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX950-GISEL-NEXT: v_pk_max_f16 v19, v19, v19 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 ; GFX950-GISEL-NEXT: v_pk_max_f16 v20, v20, v20 +; GFX950-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX950-GISEL-NEXT: v_pk_max_f16 v21, v21, v21 +; GFX950-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 ; GFX950-GISEL-NEXT: v_pk_max_f16 v22, v22, v22 +; GFX950-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 ; GFX950-GISEL-NEXT: v_pk_max_f16 v23, v23, v23 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 ; GFX950-GISEL-NEXT: v_pk_max_f16 v24, v24, v24 +; GFX950-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 ; GFX950-GISEL-NEXT: v_pk_max_f16 v25, v25, v25 +; GFX950-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 ; GFX950-GISEL-NEXT: v_pk_max_f16 v26, v26, v26 +; GFX950-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 ; GFX950-GISEL-NEXT: v_pk_max_f16 v27, v27, v27 +; GFX950-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 ; GFX950-GISEL-NEXT: v_pk_max_f16 v28, v28, v28 +; GFX950-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 ; GFX950-GISEL-NEXT: v_pk_max_f16 v29, v29, v29 +; GFX950-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 ; GFX950-GISEL-NEXT: v_pk_max_f16 v30, v30, v30 +; GFX950-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v16 ; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v17 ; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v18 @@ -6285,34 +6363,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX10-GISEL-NEXT: v_pk_max_f16 v16, v16, v16 ; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX10-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX10-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX10-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX10-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX10-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX10-GISEL-NEXT: 
v_pk_max_f16 v11, v11, v11 -; GFX10-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX10-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX10-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX10-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX10-GISEL-NEXT: v_pk_max_f16 v17, v17, v17 +; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-GISEL-NEXT: v_pk_max_f16 v18, v18, v18 +; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-GISEL-NEXT: v_pk_max_f16 v19, v19, v19 +; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 ; GFX10-GISEL-NEXT: v_pk_max_f16 v20, v20, v20 +; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX10-GISEL-NEXT: v_pk_max_f16 v21, v21, v21 +; GFX10-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 ; GFX10-GISEL-NEXT: v_pk_max_f16 v22, v22, v22 +; GFX10-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 ; GFX10-GISEL-NEXT: v_pk_max_f16 v23, v23, v23 +; GFX10-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 ; GFX10-GISEL-NEXT: v_pk_max_f16 v24, v24, v24 +; GFX10-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 ; GFX10-GISEL-NEXT: v_pk_max_f16 v25, v25, v25 +; GFX10-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 ; GFX10-GISEL-NEXT: v_pk_max_f16 v26, v26, v26 +; GFX10-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 ; GFX10-GISEL-NEXT: v_pk_max_f16 v27, v27, v27 +; GFX10-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 ; GFX10-GISEL-NEXT: v_pk_max_f16 v28, v28, v28 +; GFX10-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 ; GFX10-GISEL-NEXT: v_pk_max_f16 v29, v29, v29 +; GFX10-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 ; GFX10-GISEL-NEXT: v_pk_max_f16 v30, v30, v30 +; GFX10-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v16 ; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v17 ; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v18 @@ -6396,34 +6474,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-GISEL-NEXT: v_pk_max_f16 v16, v16, v16 ; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; 
GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX11-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX11-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX11-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX11-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX11-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX11-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX11-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX11-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX11-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX11-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX11-GISEL-NEXT: v_pk_max_f16 v17, v17, v17 +; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-GISEL-NEXT: v_pk_max_f16 v18, v18, v18 +; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX11-GISEL-NEXT: v_pk_max_f16 v19, v19, v19 +; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 ; GFX11-GISEL-NEXT: v_pk_max_f16 v20, v20, v20 +; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX11-GISEL-NEXT: v_pk_max_f16 v21, v21, v21 +; GFX11-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 ; GFX11-GISEL-NEXT: v_pk_max_f16 v22, v22, v22 +; GFX11-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 ; GFX11-GISEL-NEXT: v_pk_max_f16 v23, v23, v23 +; GFX11-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 ; GFX11-GISEL-NEXT: v_pk_max_f16 v24, v24, v24 +; GFX11-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 ; GFX11-GISEL-NEXT: v_pk_max_f16 v25, v25, v25 +; GFX11-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 ; GFX11-GISEL-NEXT: v_pk_max_f16 v26, v26, v26 +; GFX11-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 ; GFX11-GISEL-NEXT: v_pk_max_f16 v27, v27, v27 +; GFX11-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 ; GFX11-GISEL-NEXT: v_pk_max_f16 v28, v28, v28 +; GFX11-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 ; GFX11-GISEL-NEXT: v_pk_max_f16 v29, v29, v29 +; GFX11-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 ; GFX11-GISEL-NEXT: v_pk_max_f16 v30, v30, v30 +; GFX11-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v16 ; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v17 ; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v18 @@ 
-6516,34 +6594,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v16, v16, v16 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v6 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v7 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v8, v8, v8 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v9, v9, v9 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v10, v10, v10 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v11, v11, v11 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v12, v12, v12 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v13, v13, v13 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v14, v14, v14 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v15, v15, v15 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v17, v17, v17 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v18, v18, v18 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v19, v19, v19 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v20, v20, v20 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v21, v21, v21 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v6 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v22, v22, v22 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v7 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v23, v23, v23 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v8, v8, v8 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v24, v24, v24 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v9, v9, v9 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v25, v25, v25 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v10, v10, v10 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v26, v26, v26 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v11, v11, v11 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v27, v27, 
v27 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v12, v12, v12 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v28, v28, v28 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v13, v13, v13 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v29, v29, v29 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v14, v14, v14 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v30, v30, v30 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v15, v15, v15 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v16 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v17 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v18 @@ -6584,11 +6662,11 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v2f32: @@ -6606,11 +6684,11 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX8-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX8-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX8-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: v_maximumnum_v2f32: @@ -6624,29 +6702,16 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2 ; 
GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-GISEL-LABEL: v_maximumnum_v2f32: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX900-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX900-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX900-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX900-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX900-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: v_maximumnum_v2f32: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX950-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX950-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_maximumnum_v2f32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_maximumnum_v2f32: ; GFX10-SDAG: ; %bb.0: @@ -6663,8 +6728,8 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-GISEL-NEXT: v_max_f32_e32 v3, v3, 
v3 ; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 @@ -6784,14 +6849,14 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v3f32: @@ -6812,14 +6877,14 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX8-GISEL-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX8-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX8-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX8-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: v_maximumnum_v3f32: @@ -6836,40 +6901,19 
@@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX9-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-GISEL-LABEL: v_maximumnum_v3f32: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX900-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX900-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX900-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX900-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX900-GISEL-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX900-GISEL-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX900-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX900-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: v_maximumnum_v3f32: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v5 -; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX950-GISEL-NEXT: v_max_f32_e32 v3, v6, v6 -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX950-GISEL-NEXT: v_max_f32_e32 v3, v7, v7 -; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX950-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX950-GISEL-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX950-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_maximumnum_v3f32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; 
GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_maximumnum_v3f32: ; GFX10-SDAG: ; %bb.0: @@ -6889,10 +6933,10 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-GISEL-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v3 ; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 @@ -6913,10 +6957,10 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX11-GISEL-LABEL: v_maximumnum_v3f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 -; GFX11-GISEL-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v3, v3, v3 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v4, v4, v4 +; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v5, v5, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_dual_max_f32 v0, v0, v3 :: v_dual_max_f32 v1, v1, v4 ; 
GFX11-GISEL-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -6943,10 +6987,10 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GFX12-GISEL-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 -; GFX12-GISEL-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v3, v3, v3 +; GFX12-GISEL-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v4, v4, v4 +; GFX12-GISEL-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v5, v5, v5 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_dual_max_num_f32 v0, v0, v3 :: v_dual_max_num_f32 v1, v1, v4 ; GFX12-GISEL-NEXT: v_max_num_f32_e32 v2, v2, v5 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -7030,17 +7074,17 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-GISEL-NEXT: 
v_max_f32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX7-GISEL-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; GFX7-GISEL-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_maximumnum_v4f32: @@ -7064,17 +7108,17 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX8-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 ; GFX8-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GFX8-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX8-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX8-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX8-GISEL-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX8-GISEL-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX8-GISEL-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; GFX8-GISEL-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: v_maximumnum_v4f32: @@ -7094,43 +7138,22 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX9-SDAG-NEXT: v_max_f32_e32 v3, v3, v4 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-GISEL-LABEL: v_maximumnum_v4f32: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX900-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX900-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX900-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 
-; GFX900-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX900-GISEL-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX900-GISEL-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX900-GISEL-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX900-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX900-GISEL-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX900-GISEL-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX900-GISEL-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: v_maximumnum_v4f32: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 -; GFX950-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v6, v6 -; GFX950-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX950-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX950-GISEL-NEXT: v_max_f32_e32 v3, v3, v4 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_maximumnum_v4f32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v6, v6 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX9-GISEL-NEXT: 
v_max_f32_e32 v3, v3, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v3, v4 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_maximumnum_v4f32: ; GFX10-SDAG: ; %bb.0: @@ -7153,12 +7176,12 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX10-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-GISEL-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-GISEL-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX10-GISEL-NEXT: v_max_f32_e32 v7, v7, v7 ; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v5 @@ -7182,10 +7205,10 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 ; GFX11-GISEL-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5 +; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 ; GFX11-GISEL-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v1, v1, v5 ; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v6 :: v_dual_max_f32 v3, v3, v7 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -7214,10 +7237,10 @@ define <4 x float> @v_maximumnum_v4f32(<4 x 
float> %x, <4 x float> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GFX12-GISEL-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 ; GFX12-GISEL-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5 +; GFX12-GISEL-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 ; GFX12-GISEL-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_dual_max_num_f32 v0, v0, v4 :: v_dual_max_num_f32 v1, v1, v5 ; GFX12-GISEL-NEXT: v_dual_max_num_f32 v2, v2, v6 :: v_dual_max_num_f32 v3, v3, v7 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -7365,11 +7388,11 @@ define <2 x double> @v_maximumnum_v2f64(<2 x double> %x, <2 x double> %y) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] -; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_maximumnum_v2f64: @@ -7606,14 +7629,14 @@ define <3 x double> @v_maximumnum_v3f64(<3 x double> %x, <3 x double> %y) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX950-GISEL-NEXT: 
v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] -; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX950-GISEL-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] -; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] -; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] +; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[10:11], v[10:11] +; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_maximumnum_v3f64: @@ -7895,17 +7918,17 @@ define <4 x double> @v_maximumnum_v4f64(<4 x double> %x, <4 x double> %y) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] ; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] ; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[12:13], v[12:13] +; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[8:9] ; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] -; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX950-GISEL-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX950-GISEL-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX950-GISEL-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] -; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] -; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] -; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] -; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] +; 
GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[14:15], v[14:15] +; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[6:7], v[8:9] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_maximumnum_v4f64: @@ -8091,10 +8114,10 @@ define half @v_maximumnum_f16_no_ieee(half %x, half %y) #0 { ; GFX7-SDAG-LABEL: v_maximumnum_f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -8107,89 +8130,35 @@ define half @v_maximumnum_f16_no_ieee(half %x, half %y) #0 { ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-SDAG-LABEL: v_maximumnum_f16_no_ieee: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: v_maximumnum_f16_no_ieee: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: v_maximumnum_f16_no_ieee: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_maximumnum_f16_no_ieee: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: 
v_max_f16_e32 v0, v0, v0 -; GFX9-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX9-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_maximumnum_f16_no_ieee: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_maximumnum_f16_no_ieee: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_maximumnum_f16_no_ieee: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-SDAG-LABEL: v_maximumnum_f16_no_ieee: -; GFX11-TRUE16-SDAG: ; %bb.0: -; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-SDAG-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-SDAG-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-SDAG-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h -; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximumnum_f16_no_ieee: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-GISEL-LABEL: v_maximumnum_f16_no_ieee: -; GFX11-TRUE16-GISEL: ; %bb.0: -; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-GISEL-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-GISEL-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-GISEL-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h -; 
GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_maximumnum_f16_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-SDAG-LABEL: v_maximumnum_f16_no_ieee: -; GFX11-FAKE16-SDAG: ; %bb.0: -; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-FAKE16-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-FAKE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximumnum_f16_no_ieee: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-GISEL-LABEL: v_maximumnum_f16_no_ieee: -; GFX11-FAKE16-GISEL: ; %bb.0: -; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-GISEL-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-FAKE16-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: v_maximumnum_f16_no_ieee: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-TRUE16-SDAG-LABEL: v_maximumnum_f16_no_ieee: ; GFX12-TRUE16-SDAG: ; %bb.0: @@ -8320,85 +8289,35 @@ define half @v_maximumnum_f16_nan_no_ieee(half %x, half %y) #0 { } define float @v_maximumnum_f32_no_ieee(float %x, float %y) #0 { -; GFX7-SDAG-LABEL: v_maximumnum_f32_no_ieee: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 
-; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_maximumnum_f32_no_ieee: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-SDAG-LABEL: v_maximumnum_f32_no_ieee: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: v_maximumnum_f32_no_ieee: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: v_maximumnum_f32_no_ieee: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_maximumnum_f32_no_ieee: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_maximumnum_f32_no_ieee: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_maximumnum_f32_no_ieee: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; 
GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_maximumnum_f32_no_ieee: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_maximumnum_f32_no_ieee: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximumnum_f32_no_ieee: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_maximumnum_f32_no_ieee: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_maximumnum_f32_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_maximumnum_f32_no_ieee: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_maximumnum_f32_no_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: v_maximumnum_f32_no_ieee: ; GFX12-SDAG: ; %bb.0: @@ -8472,87 +8391,35 @@ 
define float @v_maximumnum_f32_nnan_no_ieee(float %x, float %y) #0 { } define double @v_maximumnum_f64_no_ieee(double %x, double %y) #0 { -; GFX7-SDAG-LABEL: v_maximumnum_f64_no_ieee: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_maximumnum_f64_no_ieee: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-SDAG-LABEL: v_maximumnum_f64_no_ieee: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: v_maximumnum_f64_no_ieee: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: v_maximumnum_f64_no_ieee: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_maximumnum_f64_no_ieee: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-GISEL-NEXT: 
v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_maximumnum_f64_no_ieee: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_maximumnum_f64_no_ieee: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_maximumnum_f64_no_ieee: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_maximumnum_f64_no_ieee: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximumnum_f64_no_ieee: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_maximumnum_f64_no_ieee: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_maximumnum_f64_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: 
v_maximumnum_f64_no_ieee: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_maximumnum_f64_no_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: v_maximumnum_f64_no_ieee: ; GFX12-SDAG: ; %bb.0: @@ -8631,14 +8498,14 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -8659,11 +8526,7 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX8-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v3, v0, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -8671,82 +8534,28 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX8-GISEL-LABEL: v_maximumnum_v2f16_no_ieee: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v2, v3 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_maximumnum_v2f16_no_ieee: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX900-GISEL-NEXT: v_pk_max_f16 
v1, v1, v1 -; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: v_maximumnum_v2f16_no_ieee: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_maximumnum_v2f16_no_ieee: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_maximumnum_v2f16_no_ieee: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: 
v_maximumnum_v2f16_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_maximumnum_v2f16_no_ieee: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_maximumnum_v2f16_no_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: v_maximumnum_v2f16_no_ieee: ; GFX12-SDAG: ; %bb.0: @@ -8909,12 +8718,19 @@ define <3 x half> @v_maximumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y) ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximumnum_v3f16_nnan_no_ieee: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: v_maximumnum_v3f16_nnan_no_ieee: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximumnum_v3f16_nnan_no_ieee: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 5cb051d2ab857..558006d2b6957 100644 --- 
a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -3239,8 +3239,8 @@ define <2 x half> @v_minimumnum_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v0, v0 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_min_f16_e32 v2, v2, v3 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3477,57 +3477,57 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v0, v0 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v3, v3 ; GFX8-GISEL-NEXT: v_min_f16_e32 v4, v4, v5 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v3, v3 +; GFX8-GISEL-NEXT: v_min_f16_e32 v1, v1, v2 ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-SDAG-LABEL: 
v_minimumnum_v3f16: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX900-SDAG-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX900-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX900-SDAG-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_minimumnum_v3f16: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX9-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-SDAG-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-SDAG-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_minimumnum_v3f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-GISEL-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX900-GISEL-LABEL: v_minimumnum_v3f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX900-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX900-GISEL-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX950-SDAG-LABEL: v_minimumnum_v3f16: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; 
GFX950-SDAG-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX950-GISEL-LABEL: v_minimumnum_v3f16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_minimumnum_v3f16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-SDAG-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX10-SDAG-NEXT: v_pk_min_f16 v1, v1, v3 @@ -3537,8 +3537,8 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_min_f16 v1, v1, v3 @@ -3547,11 +3547,11 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX11-SDAG-LABEL: v_minimumnum_v3f16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX11-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-NEXT: v_pk_max_f16 v3, v3, v3 ; 
GFX11-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX11-SDAG-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3560,10 +3560,10 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX11-GISEL-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3575,11 +3575,11 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_pk_max_num_f16 v3, v3, v3 ; GFX12-SDAG-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-SDAG-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-SDAG-NEXT: v_pk_max_num_f16 v3, v3, v3 ; GFX12-SDAG-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_pk_min_num_f16 v0, v0, v2 ; GFX12-SDAG-NEXT: v_pk_min_num_f16 v1, v1, v3 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3592,10 +3592,10 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v0, v0, v2 ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3659,12 +3659,19 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimumnum_v3f16_nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_minimumnum_v3f16_nnan: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-SDAG-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: v_minimumnum_v3f16_nnan: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-GISEL-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimumnum_v3f16_nnan: ; GFX10: ; %bb.0: @@ -3764,16 +3771,16 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v2, v2 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v1, v1 -; GFX8-GISEL-NEXT: 
v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v6, v2, v2 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v7, v3, v3 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v4, v4, v6 +; GFX8-GISEL-NEXT: v_min_f16_e32 v4, v4, v5 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_min_f16_e32 v2, v5, v7 +; GFX8-GISEL-NEXT: v_max_f16_e32 v2, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v3, v3 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v2, v2, v5 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v2, v1 @@ -3790,16 +3797,16 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_minimumnum_v4f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-GISEL-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX900-GISEL-LABEL: v_minimumnum_v4f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, 
v0, v0 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX900-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX900-GISEL-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-SDAG-LABEL: v_minimumnum_v4f16: ; GFX950-SDAG: ; %bb.0: @@ -3813,6 +3820,18 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX950-SDAG-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX950-GISEL-LABEL: v_minimumnum_v4f16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SDAG-LABEL: v_minimumnum_v4f16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3828,8 +3847,8 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_pk_min_f16 v1, v1, v3 @@ -3851,10 +3870,10 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: 
v_pk_max_f16 v3, v3, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 ; GFX11-GISEL-NEXT: v_pk_min_f16 v1, v1, v3 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3883,10 +3902,10 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v0, v0, v2 ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v1, v1, v3 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -4093,22 +4112,22 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v6, v0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v7, v3, v3 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v7, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v2, v2 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v3, v3 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v10, v4, v4 -; GFX8-GISEL-NEXT: 
v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v11, v5, v5 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v6, v6, v9 +; GFX8-GISEL-NEXT: v_min_f16_e32 v6, v6, v7 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v7, v10 +; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v7, v4, v4 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v3, v3, v7 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_min_f16_e32 v4, v8, v11 +; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v2, v2 +; GFX8-GISEL-NEXT: v_max_f16_e32 v7, v5, v5 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v4, v4, v7 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v6, v0 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v3, v1 @@ -4129,19 +4148,19 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_min_f16 v2, v2, v3 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_minimumnum_v6f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-GISEL-NEXT: 
v_pk_max_f16 v2, v2, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX9-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX9-GISEL-NEXT: v_pk_min_f16 v0, v0, v3 -; GFX9-GISEL-NEXT: v_pk_min_f16 v1, v1, v4 -; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v2, v5 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX900-GISEL-LABEL: v_minimumnum_v6f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX900-GISEL-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX900-GISEL-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX900-GISEL-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-SDAG-LABEL: v_minimumnum_v6f16: ; GFX950-SDAG: ; %bb.0: @@ -4158,6 +4177,21 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX950-SDAG-NEXT: v_pk_min_f16 v2, v2, v3 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX950-GISEL-LABEL: v_minimumnum_v6f16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v4, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v5, v5 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v2, v2, v3 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SDAG-LABEL: v_minimumnum_v6f16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4176,10 +4210,10 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) 
{ ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX10-GISEL-NEXT: v_pk_min_f16 v0, v0, v3 ; GFX10-GISEL-NEXT: v_pk_min_f16 v1, v1, v4 @@ -4205,15 +4239,14 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_pk_min_f16 v1, v1, v4 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-GISEL-NEXT: v_pk_min_f16 v2, v2, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4244,15 +4277,14 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 ; 
GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v0, v0, v3 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v1, v1, v4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v2, v2, v5 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <6 x half> @llvm.minimumnum.v6f16(<6 x half> %x, <6 x half> %y) @@ -4379,28 +4411,28 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v4, v4 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v10, v2, v2 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v11, v3, v3 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v12, v4, v4 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v13, v5, v5 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v14, v6, v6 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v15, v7, v7 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v8, v8, v12 +; GFX8-GISEL-NEXT: v_min_f16_e32 v8, v8, v9 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_min_f16_e32 v4, v9, v13 +; GFX8-GISEL-NEXT: v_max_f16_e32 v4, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v5, v5 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v4, v4, v9 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_min_f16_e32 v5, v10, v14 +; GFX8-GISEL-NEXT: v_max_f16_e32 v5, v2, v2 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v6, v6 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v5, v5, v9 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_min_f16_e32 v6, v11, v15 +; GFX8-GISEL-NEXT: v_max_f16_e32 v6, v3, v3 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v7, v7 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v6, v6, v9 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v4, v1 @@ -4425,22 +4457,22 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) 
{ ; GFX900-SDAG-NEXT: v_pk_min_f16 v3, v3, v4 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_minimumnum_v8f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX9-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX9-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX9-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX9-GISEL-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX9-GISEL-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v2, v6 -; GFX9-GISEL-NEXT: v_pk_min_f16 v3, v3, v7 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX900-GISEL-LABEL: v_minimumnum_v8f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX900-GISEL-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX900-GISEL-NEXT: v_pk_min_f16 v1, v1, v4 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX900-GISEL-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX900-GISEL-NEXT: v_pk_min_f16 v3, v3, v4 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-SDAG-LABEL: v_minimumnum_v8f16: ; GFX950-SDAG: ; %bb.0: @@ -4460,6 +4492,24 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX950-SDAG-NEXT: v_pk_min_f16 v3, v3, v4 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; +; GFX950-GISEL-LABEL: v_minimumnum_v8f16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; 
GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v5, v5 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_min_f16 v1, v1, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v6, v6 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-GISEL-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v7, v7 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v3, v3, v4 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SDAG-LABEL: v_minimumnum_v8f16: ; GFX10-SDAG: ; %bb.0: ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4481,12 +4531,12 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 ; GFX10-GISEL-NEXT: v_pk_min_f16 v0, v0, v4 ; GFX10-GISEL-NEXT: v_pk_min_f16 v1, v1, v5 @@ -4516,18 +4566,17 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; 
GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX11-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-GISEL-NEXT: v_pk_min_f16 v0, v0, v4 ; GFX11-GISEL-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-GISEL-NEXT: v_pk_min_f16 v2, v2, v6 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-GISEL-NEXT: v_pk_min_f16 v3, v3, v7 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -4561,18 +4610,17 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v6 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v0, v0, v4 ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v1, v1, v5 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v2, v2, v6 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v3, v3, v7 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> %x, <8 x half> %y) @@ -4803,52 +4851,52 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: 
v_max_f16_e32 v16, v0, v0 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v8, v8 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v8, v8 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v8, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v16, v16, v17 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v9, v9 +; GFX8-GISEL-NEXT: v_max_f16_e32 v8, v1, v1 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v9, v9 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v9, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v2, v2 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v8, v8, v17 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v10, v10 +; GFX8-GISEL-NEXT: v_max_f16_e32 v9, v2, v2 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v10, v10 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v10, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v16, v16, v19 -; GFX8-GISEL-NEXT: v_max_f16_e32 v19, v3, v3 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v9, v9, v17 ; GFX8-GISEL-NEXT: 
v_min_f16_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v10, v11, v11 +; GFX8-GISEL-NEXT: v_max_f16_e32 v10, v3, v3 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v11, v11 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v11, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v8, v17, v8 -; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v4, v4 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v10, v10, v17 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v11, v12, v12 +; GFX8-GISEL-NEXT: v_max_f16_e32 v11, v4, v4 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v12, v12 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v12, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v9, v18, v9 -; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v5, v5 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v11, v11, v17 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v12, v13, v13 +; GFX8-GISEL-NEXT: v_max_f16_e32 v12, v5, v5 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v13, v13 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v13, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v10, v19, v10 -; 
GFX8-GISEL-NEXT: v_max_f16_e32 v19, v6, v6 +; GFX8-GISEL-NEXT: v_min_f16_e32 v12, v12, v17 +; GFX8-GISEL-NEXT: v_min_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-GISEL-NEXT: v_max_f16_e32 v13, v6, v6 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v14, v14 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v11, v17, v11 -; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v7, v7 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v12, v18, v12 -; GFX8-GISEL-NEXT: v_max_f16_e32 v18, v14, v14 ; GFX8-GISEL-NEXT: v_max_f16_sdwa v14, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_max_f16_e32 v13, v15, v15 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v18, v19, v18 +; GFX8-GISEL-NEXT: v_min_f16_e32 v13, v13, v17 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-GISEL-NEXT: v_min_f16_e32 v13, v17, v13 +; GFX8-GISEL-NEXT: v_max_f16_e32 v14, v7, v7 +; GFX8-GISEL-NEXT: v_max_f16_e32 v17, v15, v15 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-GISEL-NEXT: v_min_f16_e32 v14, v14, v17 ; GFX8-GISEL-NEXT: v_min_f16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v16, v0 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v8, v1 @@ -4856,8 +4904,8 @@ define <16 x half> 
@v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-GISEL-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX8-GISEL-NEXT: v_or_b32_e32 v4, v11, v4 ; GFX8-GISEL-NEXT: v_or_b32_e32 v5, v12, v5 -; GFX8-GISEL-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX8-GISEL-NEXT: v_or_b32_e32 v7, v13, v7 +; GFX8-GISEL-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX8-GISEL-NEXT: v_or_b32_e32 v7, v14, v7 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: v_minimumnum_v16f16: @@ -4889,34 +4937,34 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX900-SDAG-NEXT: v_pk_min_f16 v7, v7, v8 ; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-GISEL-LABEL: v_minimumnum_v16f16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX9-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX9-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX9-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX9-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX9-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX9-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX9-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX9-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX9-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX9-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX9-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX9-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX9-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX9-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX9-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 -; GFX9-GISEL-NEXT: v_pk_min_f16 v0, v0, v8 -; GFX9-GISEL-NEXT: v_pk_min_f16 v1, v1, v9 -; GFX9-GISEL-NEXT: v_pk_min_f16 v2, v2, v10 -; GFX9-GISEL-NEXT: v_pk_min_f16 v3, v3, v11 -; GFX9-GISEL-NEXT: v_pk_min_f16 v4, v4, v12 -; GFX9-GISEL-NEXT: v_pk_min_f16 v5, v5, v13 -; GFX9-GISEL-NEXT: v_pk_min_f16 v6, v6, v14 -; GFX9-GISEL-NEXT: v_pk_min_f16 v7, v7, v15 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX900-GISEL-LABEL: v_minimumnum_v16f16: +; GFX900-GISEL: ; %bb.0: +; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 +; GFX900-GISEL-NEXT: v_pk_min_f16 v0, v0, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v9, v9 +; GFX900-GISEL-NEXT: v_pk_min_f16 v1, v1, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v10, v10 +; GFX900-GISEL-NEXT: v_pk_min_f16 v2, v2, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v11, v11 +; GFX900-GISEL-NEXT: v_pk_min_f16 v3, v3, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v12, v12 +; GFX900-GISEL-NEXT: v_pk_min_f16 v4, v4, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v13, v13 +; GFX900-GISEL-NEXT: v_pk_min_f16 v5, v5, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v14, v14 +; GFX900-GISEL-NEXT: v_pk_min_f16 v6, v6, v8 +; GFX900-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX900-GISEL-NEXT: v_pk_max_f16 v8, v15, v15 +; GFX900-GISEL-NEXT: v_pk_min_f16 v7, v7, v8 +; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-SDAG-LABEL: v_minimumnum_v16f16: ; GFX950-SDAG: ; %bb.0: @@ -4948,15 +4996,45 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX950-SDAG-NEXT: v_pk_min_f16 v7, v7, v8 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_minimumnum_v16f16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-SDAG-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-SDAG-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX10-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-LABEL: v_minimumnum_v16f16: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, 
v0, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v9, v9 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-GISEL-NEXT: v_pk_min_f16 v1, v1, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v10, v10 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-GISEL-NEXT: v_pk_min_f16 v2, v2, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v11, v11 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX950-GISEL-NEXT: v_pk_min_f16 v3, v3, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v12, v12 +; GFX950-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX950-GISEL-NEXT: v_pk_min_f16 v4, v4, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v13, v13 +; GFX950-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX950-GISEL-NEXT: v_pk_min_f16 v5, v5, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v14, v14 +; GFX950-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 +; GFX950-GISEL-NEXT: v_pk_min_f16 v6, v6, v8 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v15, v15 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v7, v7, v8 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: v_minimumnum_v16f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_pk_max_f16 v8, v8, v8 +; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-SDAG-NEXT: v_pk_max_f16 v9, v9, v9 +; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX10-SDAG-NEXT: v_pk_max_f16 v10, v10, v10 +; GFX10-SDAG-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-SDAG-NEXT: v_pk_min_f16 v0, v0, v8 ; GFX10-SDAG-NEXT: v_pk_max_f16 v8, v11, v11 ; GFX10-SDAG-NEXT: v_pk_min_f16 v1, v1, v9 @@ -4981,29 +5059,29 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX10-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 ; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; 
GFX10-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 ; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX10-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 +; GFX10-GISEL-NEXT: v_pk_min_f16 v0, v0, v8 ; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX10-GISEL-NEXT: v_pk_min_f16 v1, v1, v9 +; GFX10-GISEL-NEXT: v_pk_max_f16 v8, v11, v11 +; GFX10-GISEL-NEXT: v_pk_min_f16 v2, v2, v10 ; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX10-GISEL-NEXT: v_pk_max_f16 v9, v12, v12 ; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX10-GISEL-NEXT: v_pk_max_f16 v10, v13, v13 ; GFX10-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX10-GISEL-NEXT: v_pk_max_f16 v11, v14, v14 ; GFX10-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX10-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX10-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX10-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX10-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX10-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX10-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX10-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX10-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 -; GFX10-GISEL-NEXT: v_pk_min_f16 v0, v0, v8 -; GFX10-GISEL-NEXT: v_pk_min_f16 v1, v1, v9 -; GFX10-GISEL-NEXT: v_pk_min_f16 v2, v2, v10 -; GFX10-GISEL-NEXT: v_pk_min_f16 v3, v3, v11 -; GFX10-GISEL-NEXT: v_pk_min_f16 v4, v4, v12 -; GFX10-GISEL-NEXT: v_pk_min_f16 v5, v5, v13 -; GFX10-GISEL-NEXT: v_pk_min_f16 v6, v6, v14 -; GFX10-GISEL-NEXT: v_pk_min_f16 v7, v7, v15 +; GFX10-GISEL-NEXT: v_pk_max_f16 v12, v15, v15 +; GFX10-GISEL-NEXT: v_pk_min_f16 v3, v3, v8 +; GFX10-GISEL-NEXT: v_pk_min_f16 v4, v4, v9 +; GFX10-GISEL-NEXT: v_pk_min_f16 v5, v5, v10 +; GFX10-GISEL-NEXT: v_pk_min_f16 v6, v6, v11 +; GFX10-GISEL-NEXT: v_pk_min_f16 v7, v7, v12 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: v_minimumnum_v16f16: @@ -5039,29 +5117,29 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_pk_max_f16 
v0, v0, v0 +; GFX11-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 ; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 ; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX11-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 +; GFX11-GISEL-NEXT: v_pk_min_f16 v0, v0, v8 ; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-GISEL-NEXT: v_pk_min_f16 v1, v1, v9 +; GFX11-GISEL-NEXT: v_pk_max_f16 v8, v11, v11 +; GFX11-GISEL-NEXT: v_pk_min_f16 v2, v2, v10 ; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 +; GFX11-GISEL-NEXT: v_pk_max_f16 v9, v12, v12 ; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 +; GFX11-GISEL-NEXT: v_pk_max_f16 v10, v13, v13 ; GFX11-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 +; GFX11-GISEL-NEXT: v_pk_max_f16 v11, v14, v14 ; GFX11-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX11-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX11-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX11-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX11-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX11-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX11-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX11-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX11-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 -; GFX11-GISEL-NEXT: v_pk_min_f16 v0, v0, v8 -; GFX11-GISEL-NEXT: v_pk_min_f16 v1, v1, v9 -; GFX11-GISEL-NEXT: v_pk_min_f16 v2, v2, v10 -; GFX11-GISEL-NEXT: v_pk_min_f16 v3, v3, v11 -; GFX11-GISEL-NEXT: v_pk_min_f16 v4, v4, v12 -; GFX11-GISEL-NEXT: v_pk_min_f16 v5, v5, v13 -; GFX11-GISEL-NEXT: v_pk_min_f16 v6, v6, v14 -; GFX11-GISEL-NEXT: v_pk_min_f16 v7, v7, v15 +; GFX11-GISEL-NEXT: v_pk_max_f16 v12, v15, v15 +; GFX11-GISEL-NEXT: v_pk_min_f16 v3, v3, v8 +; GFX11-GISEL-NEXT: v_pk_min_f16 v4, v4, v9 +; GFX11-GISEL-NEXT: v_pk_min_f16 v5, v5, v10 +; GFX11-GISEL-NEXT: v_pk_min_f16 v6, v6, v11 +; GFX11-GISEL-NEXT: v_pk_min_f16 v7, v7, v12 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: v_minimumnum_v16f16: @@ -5105,29 +5183,29 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; 
GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v8, v8, v8 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v9, v9, v9 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v10, v10, v10 +; GFX12-GISEL-NEXT: v_pk_min_num_f16 v0, v0, v8 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 +; GFX12-GISEL-NEXT: v_pk_min_num_f16 v1, v1, v9 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v8, v11, v11 +; GFX12-GISEL-NEXT: v_pk_min_num_f16 v2, v2, v10 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v9, v12, v12 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v10, v13, v13 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v6 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v11, v14, v14 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v7 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v8, v8, v8 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v9, v9, v9 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v10, v10, v10 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v11, v11, v11 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v12, v12, v12 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v13, v13, v13 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v14, v14, v14 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v15, v15, v15 -; GFX12-GISEL-NEXT: v_pk_min_num_f16 v0, v0, v8 -; GFX12-GISEL-NEXT: v_pk_min_num_f16 v1, v1, v9 -; GFX12-GISEL-NEXT: v_pk_min_num_f16 v2, v2, v10 -; GFX12-GISEL-NEXT: v_pk_min_num_f16 v3, v3, v11 -; GFX12-GISEL-NEXT: v_pk_min_num_f16 v4, v4, v12 -; GFX12-GISEL-NEXT: v_pk_min_num_f16 v5, v5, v13 -; GFX12-GISEL-NEXT: v_pk_min_num_f16 v6, v6, v14 -; GFX12-GISEL-NEXT: v_pk_min_num_f16 v7, v7, v15 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v12, v15, v15 +; GFX12-GISEL-NEXT: v_pk_min_num_f16 v3, v3, v8 +; GFX12-GISEL-NEXT: v_pk_min_num_f16 v4, v4, v9 +; GFX12-GISEL-NEXT: v_pk_min_num_f16 v5, v5, v10 +; GFX12-GISEL-NEXT: 
v_pk_min_num_f16 v6, v6, v11 +; GFX12-GISEL-NEXT: v_pk_min_num_f16 v7, v7, v12 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %x, <16 x half> %y) ret <16 x half> %result @@ -5999,34 +6077,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX950-GISEL-NEXT: v_pk_max_f16 v16, v16, v16 ; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX950-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX950-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX950-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX950-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX950-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX950-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX950-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX950-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX950-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX950-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX950-GISEL-NEXT: v_pk_max_f16 v17, v17, v17 +; GFX950-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX950-GISEL-NEXT: v_pk_max_f16 v18, v18, v18 +; GFX950-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX950-GISEL-NEXT: v_pk_max_f16 v19, v19, v19 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 ; GFX950-GISEL-NEXT: v_pk_max_f16 v20, v20, v20 +; GFX950-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX950-GISEL-NEXT: v_pk_max_f16 v21, v21, v21 +; GFX950-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 ; GFX950-GISEL-NEXT: v_pk_max_f16 v22, v22, v22 +; GFX950-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 ; GFX950-GISEL-NEXT: v_pk_max_f16 v23, v23, v23 +; GFX950-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 ; GFX950-GISEL-NEXT: v_pk_max_f16 v24, v24, v24 +; GFX950-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 ; GFX950-GISEL-NEXT: v_pk_max_f16 v25, v25, v25 +; GFX950-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 ; GFX950-GISEL-NEXT: 
v_pk_max_f16 v26, v26, v26 +; GFX950-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 ; GFX950-GISEL-NEXT: v_pk_max_f16 v27, v27, v27 +; GFX950-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 ; GFX950-GISEL-NEXT: v_pk_max_f16 v28, v28, v28 +; GFX950-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 ; GFX950-GISEL-NEXT: v_pk_max_f16 v29, v29, v29 +; GFX950-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 ; GFX950-GISEL-NEXT: v_pk_max_f16 v30, v30, v30 +; GFX950-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v16 ; GFX950-GISEL-NEXT: v_pk_min_f16 v1, v1, v17 ; GFX950-GISEL-NEXT: v_pk_min_f16 v2, v2, v18 @@ -6110,34 +6188,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX10-GISEL-NEXT: v_pk_max_f16 v16, v16, v16 ; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX10-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX10-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX10-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX10-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX10-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX10-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX10-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX10-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX10-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX10-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX10-GISEL-NEXT: v_pk_max_f16 v17, v17, v17 +; GFX10-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX10-GISEL-NEXT: v_pk_max_f16 v18, v18, v18 +; GFX10-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; GFX10-GISEL-NEXT: v_pk_max_f16 v19, v19, v19 +; GFX10-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 ; GFX10-GISEL-NEXT: v_pk_max_f16 v20, v20, v20 +; GFX10-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX10-GISEL-NEXT: v_pk_max_f16 v21, v21, v21 +; GFX10-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 ; GFX10-GISEL-NEXT: v_pk_max_f16 v22, v22, v22 +; 
GFX10-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 ; GFX10-GISEL-NEXT: v_pk_max_f16 v23, v23, v23 +; GFX10-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 ; GFX10-GISEL-NEXT: v_pk_max_f16 v24, v24, v24 +; GFX10-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 ; GFX10-GISEL-NEXT: v_pk_max_f16 v25, v25, v25 +; GFX10-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 ; GFX10-GISEL-NEXT: v_pk_max_f16 v26, v26, v26 +; GFX10-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 ; GFX10-GISEL-NEXT: v_pk_max_f16 v27, v27, v27 +; GFX10-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 ; GFX10-GISEL-NEXT: v_pk_max_f16 v28, v28, v28 +; GFX10-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 ; GFX10-GISEL-NEXT: v_pk_max_f16 v29, v29, v29 +; GFX10-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 ; GFX10-GISEL-NEXT: v_pk_max_f16 v30, v30, v30 +; GFX10-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX10-GISEL-NEXT: v_pk_min_f16 v0, v0, v16 ; GFX10-GISEL-NEXT: v_pk_min_f16 v1, v1, v17 ; GFX10-GISEL-NEXT: v_pk_min_f16 v2, v2, v18 @@ -6221,34 +6299,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-GISEL-NEXT: v_pk_max_f16 v16, v16, v16 ; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 -; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 -; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 -; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 -; GFX11-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 -; GFX11-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 -; GFX11-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 -; GFX11-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 -; GFX11-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 -; GFX11-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 -; GFX11-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 -; GFX11-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 -; GFX11-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 -; GFX11-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX11-GISEL-NEXT: v_pk_max_f16 v17, v17, v17 +; GFX11-GISEL-NEXT: v_pk_max_f16 v2, v2, v2 ; GFX11-GISEL-NEXT: v_pk_max_f16 v18, v18, v18 +; GFX11-GISEL-NEXT: v_pk_max_f16 v3, v3, v3 ; 
GFX11-GISEL-NEXT: v_pk_max_f16 v19, v19, v19 +; GFX11-GISEL-NEXT: v_pk_max_f16 v4, v4, v4 ; GFX11-GISEL-NEXT: v_pk_max_f16 v20, v20, v20 +; GFX11-GISEL-NEXT: v_pk_max_f16 v5, v5, v5 ; GFX11-GISEL-NEXT: v_pk_max_f16 v21, v21, v21 +; GFX11-GISEL-NEXT: v_pk_max_f16 v6, v6, v6 ; GFX11-GISEL-NEXT: v_pk_max_f16 v22, v22, v22 +; GFX11-GISEL-NEXT: v_pk_max_f16 v7, v7, v7 ; GFX11-GISEL-NEXT: v_pk_max_f16 v23, v23, v23 +; GFX11-GISEL-NEXT: v_pk_max_f16 v8, v8, v8 ; GFX11-GISEL-NEXT: v_pk_max_f16 v24, v24, v24 +; GFX11-GISEL-NEXT: v_pk_max_f16 v9, v9, v9 ; GFX11-GISEL-NEXT: v_pk_max_f16 v25, v25, v25 +; GFX11-GISEL-NEXT: v_pk_max_f16 v10, v10, v10 ; GFX11-GISEL-NEXT: v_pk_max_f16 v26, v26, v26 +; GFX11-GISEL-NEXT: v_pk_max_f16 v11, v11, v11 ; GFX11-GISEL-NEXT: v_pk_max_f16 v27, v27, v27 +; GFX11-GISEL-NEXT: v_pk_max_f16 v12, v12, v12 ; GFX11-GISEL-NEXT: v_pk_max_f16 v28, v28, v28 +; GFX11-GISEL-NEXT: v_pk_max_f16 v13, v13, v13 ; GFX11-GISEL-NEXT: v_pk_max_f16 v29, v29, v29 +; GFX11-GISEL-NEXT: v_pk_max_f16 v14, v14, v14 ; GFX11-GISEL-NEXT: v_pk_max_f16 v30, v30, v30 +; GFX11-GISEL-NEXT: v_pk_max_f16 v15, v15, v15 ; GFX11-GISEL-NEXT: v_pk_min_f16 v0, v0, v16 ; GFX11-GISEL-NEXT: v_pk_min_f16 v1, v1, v17 ; GFX11-GISEL-NEXT: v_pk_min_f16 v2, v2, v18 @@ -6341,34 +6419,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v0, v0, v0 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v16, v16, v16 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v1, v1, v1 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v6 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v7 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v8, v8, v8 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v9, v9, v9 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v10, v10, v10 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v11, v11, v11 -; 
GFX12-GISEL-NEXT: v_pk_max_num_f16 v12, v12, v12 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v13, v13, v13 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v14, v14, v14 -; GFX12-GISEL-NEXT: v_pk_max_num_f16 v15, v15, v15 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v17, v17, v17 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v2, v2, v2 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v18, v18, v18 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v3, v3, v3 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v19, v19, v19 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v4, v4, v4 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v20, v20, v20 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v5, v5, v5 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v21, v21, v21 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v6, v6, v6 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v22, v22, v22 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v7, v7, v7 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v23, v23, v23 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v8, v8, v8 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v24, v24, v24 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v9, v9, v9 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v25, v25, v25 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v10, v10, v10 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v26, v26, v26 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v11, v11, v11 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v27, v27, v27 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v12, v12, v12 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v28, v28, v28 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v13, v13, v13 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v29, v29, v29 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v14, v14, v14 ; GFX12-GISEL-NEXT: v_pk_max_num_f16 v30, v30, v30 +; GFX12-GISEL-NEXT: v_pk_max_num_f16 v15, v15, v15 ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v0, v0, v16 ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v1, v1, v17 ; GFX12-GISEL-NEXT: v_pk_min_num_f16 v2, v2, v18 @@ -6409,11 +6487,11 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; 
GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v2f32: @@ -6431,11 +6509,11 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX8-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX8-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX8-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GFX8-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: v_minimumnum_v2f32: @@ -6449,29 +6527,16 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-GISEL-LABEL: v_minimumnum_v2f32: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX900-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX900-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX900-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX900-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX900-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: v_minimumnum_v2f32: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, 
v[2:3] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX950-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX950-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX950-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX950-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_minimumnum_v2f32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_minimumnum_v2f32: ; GFX10-SDAG: ; %bb.0: @@ -6488,8 +6553,8 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX10-GISEL-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX10-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 @@ -6609,14 +6674,14 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-GISEL-NEXT: 
v_mul_f32_e32 v3, 1.0, v4 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v3f32: @@ -6637,14 +6702,14 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX8-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX8-GISEL-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX8-GISEL-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GFX8-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v5 +; GFX8-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: v_minimumnum_v3f32: @@ -6661,40 +6726,19 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX9-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-GISEL-LABEL: v_minimumnum_v3f32: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX900-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX900-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX900-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX900-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX900-GISEL-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX900-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX900-GISEL-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX900-GISEL-NEXT: v_min_f32_e32 v2, v2, v5 -; GFX900-GISEL-NEXT: s_setpc_b64 
s[30:31] -; -; GFX950-GISEL-LABEL: v_minimumnum_v3f32: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v5 -; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX950-GISEL-NEXT: v_max_f32_e32 v3, v6, v6 -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX950-GISEL-NEXT: v_max_f32_e32 v3, v7, v7 -; GFX950-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX950-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX950-GISEL-NEXT: v_max_f32_e32 v3, v4, v4 -; GFX950-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_minimumnum_v3f32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v4, v4 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v5, v5 +; GFX9-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_minimumnum_v3f32: ; GFX10-SDAG: ; %bb.0: @@ -6714,10 +6758,10 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-GISEL-NEXT: 
v_max_f32_e32 v3, v3, v3 +; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-GISEL-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX10-GISEL-NEXT: v_min_f32_e32 v0, v0, v3 ; GFX10-GISEL-NEXT: v_min_f32_e32 v1, v1, v4 @@ -6738,10 +6782,10 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX11-GISEL-LABEL: v_minimumnum_v3f32: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 -; GFX11-GISEL-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v3, v3, v3 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v4, v4, v4 +; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v5, v5, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_dual_min_f32 v0, v0, v3 :: v_dual_min_f32 v1, v1, v4 ; GFX11-GISEL-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -6768,10 +6812,10 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) { ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GFX12-GISEL-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 -; GFX12-GISEL-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v3, v3, v3 +; GFX12-GISEL-NEXT: 
v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v4, v4, v4 +; GFX12-GISEL-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v5, v5, v5 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_dual_min_num_f32 v0, v0, v3 :: v_dual_min_num_f32 v1, v1, v4 ; GFX12-GISEL-NEXT: v_min_num_f32_e32 v2, v2, v5 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -6855,17 +6899,17 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX7-GISEL: ; %bb.0: ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v4 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX7-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX7-GISEL-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX7-GISEL-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; GFX7-GISEL-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: v_minimumnum_v4f32: @@ -6889,17 +6933,17 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v5 +; 
GFX8-GISEL-NEXT: v_min_f32_e32 v1, v1, v4 ; GFX8-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GFX8-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX8-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX8-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX8-GISEL-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX8-GISEL-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX8-GISEL-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX8-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; GFX8-GISEL-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: v_minimumnum_v4f32: @@ -6919,43 +6963,22 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX9-SDAG-NEXT: v_min_f32_e32 v3, v3, v4 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-GISEL-LABEL: v_minimumnum_v4f32: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX900-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX900-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX900-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX900-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX900-GISEL-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX900-GISEL-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX900-GISEL-NEXT: v_max_f32_e32 v7, v7, v7 -; GFX900-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX900-GISEL-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX900-GISEL-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX900-GISEL-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: v_minimumnum_v4f32: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, 
v0 -; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1] -; GFX950-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v5, v5 -; GFX950-GISEL-NEXT: v_min_f32_e32 v1, v1, v4 -; GFX950-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v6, v6 -; GFX950-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX950-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX950-GISEL-NEXT: v_min_f32_e32 v3, v3, v4 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: v_minimumnum_v4f32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v6, v6 +; GFX9-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX9-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v4, v7, v7 +; GFX9-GISEL-NEXT: v_min_f32_e32 v3, v3, v4 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_minimumnum_v4f32: ; GFX10-SDAG: ; %bb.0: @@ -6978,12 +7001,12 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX10-GISEL: ; %bb.0: ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-GISEL-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX10-GISEL-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX10-GISEL-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-GISEL-NEXT: 
v_max_f32_e32 v2, v2, v2 ; GFX10-GISEL-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-GISEL-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX10-GISEL-NEXT: v_max_f32_e32 v7, v7, v7 ; GFX10-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX10-GISEL-NEXT: v_min_f32_e32 v1, v1, v5 @@ -7007,10 +7030,10 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 ; GFX11-GISEL-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5 +; GFX11-GISEL-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 ; GFX11-GISEL-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v1, v1, v5 ; GFX11-GISEL-NEXT: v_dual_min_f32 v2, v2, v6 :: v_dual_min_f32 v3, v3, v7 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -7039,10 +7062,10 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) { ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GFX12-GISEL-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 ; GFX12-GISEL-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5 +; GFX12-GISEL-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 ; GFX12-GISEL-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_dual_min_num_f32 v0, 
v0, v4 :: v_dual_min_num_f32 v1, v1, v5 ; GFX12-GISEL-NEXT: v_dual_min_num_f32 v2, v2, v6 :: v_dual_min_num_f32 v3, v3, v7 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -7190,11 +7213,11 @@ define <2 x double> @v_minimumnum_v2f64(<2 x double> %x, <2 x double> %y) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] ; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] -; GFX950-GISEL-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] +; GFX950-GISEL-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_minimumnum_v2f64: @@ -7431,14 +7454,14 @@ define <3 x double> @v_minimumnum_v3f64(<3 x double> %x, <3 x double> %y) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] ; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] -; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX950-GISEL-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] -; GFX950-GISEL-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] -; GFX950-GISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[8:9], v[8:9] +; GFX950-GISEL-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] +; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[10:11], v[10:11] +; GFX950-GISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-SDAG-LABEL: v_minimumnum_v3f64: @@ -7720,17 +7743,17 @@ define <4 x double> @v_minimumnum_v4f64(<4 x double> %x, <4 x double> %y) { ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] ; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[10:11], v[10:11] +; GFX950-GISEL-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] ; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[12:13], v[12:13] +; GFX950-GISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[8:9] ; GFX950-GISEL-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] -; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX950-GISEL-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX950-GISEL-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX950-GISEL-NEXT: v_max_f64 v[14:15], v[14:15], v[14:15] -; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] -; GFX950-GISEL-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] -; GFX950-GISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] -; GFX950-GISEL-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] +; GFX950-GISEL-NEXT: v_max_f64 v[8:9], v[14:15], v[14:15] +; GFX950-GISEL-NEXT: v_min_f64 v[6:7], v[6:7], v[8:9] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: v_minimumnum_v4f64: @@ -7916,10 +7939,10 @@ define half @v_minimumnum_f16_no_ieee(half %x, half %y) #0 { ; GFX7-SDAG-LABEL: v_minimumnum_f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ 
-7932,89 +7955,35 @@ define half @v_minimumnum_f16_no_ieee(half %x, half %y) #0 { ; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-SDAG-LABEL: v_minimumnum_f16_no_ieee: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: v_minimumnum_f16_no_ieee: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: v_minimumnum_f16_no_ieee: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_minimumnum_f16_no_ieee: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX9-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX9-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_minimumnum_f16_no_ieee: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_minimumnum_f16_no_ieee: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX10-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: 
s_setpc_b64 s[30:31] +; GFX8-LABEL: v_minimumnum_f16_no_ieee: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-SDAG-LABEL: v_minimumnum_f16_no_ieee: -; GFX11-TRUE16-SDAG: ; %bb.0: -; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-SDAG-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-SDAG-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-SDAG-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h -; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimumnum_f16_no_ieee: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-GISEL-LABEL: v_minimumnum_f16_no_ieee: -; GFX11-TRUE16-GISEL: ; %bb.0: -; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-GISEL-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-GISEL-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l -; GFX11-TRUE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-GISEL-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h -; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_minimumnum_f16_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-SDAG-LABEL: v_minimumnum_f16_no_ieee: -; GFX11-FAKE16-SDAG: ; %bb.0: -; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-FAKE16-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-FAKE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimumnum_f16_no_ieee: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-FAKE16-GISEL-LABEL: v_minimumnum_f16_no_ieee: -; GFX11-FAKE16-GISEL: ; %bb.0: -; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-GISEL-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-FAKE16-GISEL-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-GISEL-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-FAKE16-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: v_minimumnum_f16_no_ieee: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-TRUE16-SDAG-LABEL: v_minimumnum_f16_no_ieee: ; GFX12-TRUE16-SDAG: ; %bb.0: @@ -8145,85 +8114,35 @@ define half @v_minimumnum_f16_nan_no_ieee(half %x, half %y) #0 { } define float @v_minimumnum_f32_no_ieee(float %x, float %y) #0 { -; GFX7-SDAG-LABEL: v_minimumnum_f32_no_ieee: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_minimumnum_f32_no_ieee: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-SDAG-LABEL: v_minimumnum_f32_no_ieee: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: v_minimumnum_f32_no_ieee: -; GFX8-GISEL: ; 
%bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-GISEL-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX8-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: v_minimumnum_f32_no_ieee: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_minimumnum_f32_no_ieee: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_minimumnum_f32_no_ieee: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_minimumnum_f32_no_ieee: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-SDAG-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_minimumnum_f32_no_ieee: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_minimumnum_f32_no_ieee: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimumnum_f32_no_ieee: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 
v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_minimumnum_f32_no_ieee: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_minimumnum_f32_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_minimumnum_f32_no_ieee: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_minimumnum_f32_no_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: v_minimumnum_f32_no_ieee: ; GFX12-SDAG: ; %bb.0: @@ -8297,87 +8216,35 @@ define float @v_minimumnum_f32_nnan_no_ieee(float %x, float %y) #0 { } define double @v_minimumnum_f64_no_ieee(double %x, double %y) #0 { -; GFX7-SDAG-LABEL: v_minimumnum_f64_no_ieee: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: v_minimumnum_f64_no_ieee: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX7-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX7-GISEL-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX7-GISEL-NEXT: 
s_setpc_b64 s[30:31] -; -; GFX8-SDAG-LABEL: v_minimumnum_f64_no_ieee: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: v_minimumnum_f64_no_ieee: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX8-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-GISEL-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: v_minimumnum_f64_no_ieee: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: v_minimumnum_f64_no_ieee: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-GISEL-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_minimumnum_f64_no_ieee: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: v_minimumnum_f64_no_ieee: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_minimumnum_f64_no_ieee: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: v_minimumnum_f64_no_ieee: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX10-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX10-GISEL-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimumnum_f64_no_ieee: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_minimumnum_f64_no_ieee: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_minimumnum_f64_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_minimumnum_f64_no_ieee: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_minimumnum_f64_no_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: v_minimumnum_f64_no_ieee: ; GFX12-SDAG: ; %bb.0: @@ -8456,14 +8323,14 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX7-SDAG-LABEL: 
v_minimumnum_v2f16_no_ieee: ; GFX7-SDAG: ; %bb.0: ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -8484,11 +8351,7 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX8-SDAG-LABEL: v_minimumnum_v2f16_no_ieee: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -8496,82 +8359,28 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 { ; GFX8-GISEL-LABEL: v_minimumnum_v2f16_no_ieee: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: 
v_max_f16_e32 v2, v0, v0 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-GISEL-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-GISEL-NEXT: v_min_f16_e32 v2, v2, v3 -; GFX8-GISEL-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-GISEL-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX8-GISEL-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-SDAG-LABEL: v_minimumnum_v2f16_no_ieee: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX900-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX900-SDAG-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_minimumnum_v2f16_no_ieee: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX900-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX900-GISEL-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-SDAG-LABEL: v_minimumnum_v2f16_no_ieee: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX950-SDAG-NEXT: s_nop 0 -; GFX950-SDAG-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: v_minimumnum_v2f16_no_ieee: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX950-GISEL-NEXT: s_nop 0 -; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v1 -; 
GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: v_minimumnum_v2f16_no_ieee: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-SDAG-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: v_minimumnum_v2f16_no_ieee: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX10-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX10-GISEL-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: v_minimumnum_v2f16_no_ieee: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_minimumnum_v2f16_no_ieee: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-SDAG-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_minimumnum_v2f16_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: v_minimumnum_v2f16_no_ieee: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_max_f16 v0, v0, v0 -; GFX11-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_minimumnum_v2f16_no_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: 
v_minimumnum_v2f16_no_ieee: ; GFX12-SDAG: ; %bb.0: @@ -8734,12 +8543,19 @@ define <3 x half> @v_minimumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y) ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimumnum_v3f16_nnan_no_ieee: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-SDAG-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: v_minimumnum_v3f16_nnan_no_ieee: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-GISEL-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimumnum_v3f16_nnan_no_ieee: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll deleted file mode 100644 index 1cdd027fef89d..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll +++ /dev/null @@ -1,286 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 %s | FileCheck %s - -define i8 @test_v4i8(i32 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_v4i8( -; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca <4 x i8>, align 4, addrspace(5) - store 
i32 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_a4i8(i32 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_a4i8( -; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca [4 x i8], align 4, addrspace(5) - store i32 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_a2v4i8(i64 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_a2v4i8( -; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca [2 x <4 x i8>], align 4, addrspace(5) - store i64 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_a2v3i8(i64 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_a2v3i8( -; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca [2 x <3 x i8>], align 4, addrspace(5) - store i64 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_a2a4i8(i64 
%bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_a2a4i8( -; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca [2 x [4 x i8]], align 4, addrspace(5) - store i64 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_a2a3i8(i48 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_a2a3i8( -; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca [2 x [3 x i8]], align 4, addrspace(5) - store i48 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_s1v4i8(i32 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_s1v4i8( -; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca {<4 x i8>}, align 4, addrspace(5) - store i32 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_s1a4i8(i32 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_s1a4i8( -; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = 
bitcast i32 [[BITS]] to <4 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca {[4 x i8]}, align 4, addrspace(5) - store i32 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_s4i8(i32 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_s4i8( -; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5) - store i32 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_s2v4i8(i64 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_s2v4i8( -; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5) - store i64 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_s2v2i8v4i8(i64 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_s2v2i8v4i8( -; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca {<2 x i8>, <4 x i8>}, 
align 4, addrspace(5) - store i64 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_s2v2i8v3i8(i64 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_s2v2i8v3i8( -; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5) - store i64 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_s2s2i8s4i8(i48 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_s2s2i8s4i8( -; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5) - store i48 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_s2s2i8s3i8(i40 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_s2s2i8s3i8( -; CHECK-SAME: i40 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <5 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i40 [[BITS]] to <5 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <5 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5) - store i40 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - 
%val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_s3i8s1i8v2i8(i32 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_s3i8s1i8v2i8( -; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5) - store i32 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -define i8 @test_s3i8i8s0(i16 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_s3i8i8s0( -; CHECK-SAME: i16 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = freeze <2 x i8> poison -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[BITS]] to <2 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i64 [[IDX]] -; CHECK-NEXT: ret i8 [[TMP2]] -; - %stack = alloca {i8, i8, {}}, align 4, addrspace(5) - store i16 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -; heterogeneous element types are not supported -define i8 @test_heterogeneous(i32 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_heterogeneous( -; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5) -; CHECK-NEXT: store i32 [[BITS]], ptr addrspace(5) [[STACK]], align 4 -; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] -; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 -; CHECK-NEXT: ret i8 [[VAL]] -; - %stack = alloca {i8, i8, i16}, align 4, addrspace(5) - store i32 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) 
%stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} - -; empty types are not supported -define void @test_empty() { -; CHECK-LABEL: define void @test_empty() { -; CHECK-NEXT: [[STACK:%.*]] = alloca {}, align 4, addrspace(5) -; CHECK-NEXT: ret void -; - %stack = alloca {}, align 4, addrspace(5) - ret void -} - -; singleton types are not supported -define i8 @test_singleton(i8 %bits, i64 %idx) { -; CHECK-LABEL: define i8 @test_singleton( -; CHECK-SAME: i8 [[BITS:%.*]], i64 [[IDX:%.*]]) { -; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5) -; CHECK-NEXT: store i8 [[BITS]], ptr addrspace(5) [[STACK]], align 1 -; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] -; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 -; CHECK-NEXT: ret i8 [[VAL]] -; - %stack = alloca {i8, {}}, align 4, addrspace(5) - store i8 %bits, ptr addrspace(5) %stack - %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx - %val = load i8, ptr addrspace(5) %ptr, align 1 - ret i8 %val -} diff --git a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll index 202609c8156a7..78045ddcd85aa 100644 --- a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll +++ b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll @@ -5,25 +5,25 @@ target triple = "dxilv1.5-pc-shadermodel6.5-compute" ; Confirm that DXILFinalizeLinkage will remove functions that have compatible ; linkage and are not called from anywhere. This should be any function that -; is not explicitly marked export and is not an entry point. +; is marked hidden or internal. -; Has no specified inlining/linking behavior and is uncalled, this should be removed. +; Is hidden, and uncalled, this should be removed. 
; CHECK-NOT: define {{.*}}doNothingUncalled -define void @"?doNothingUncalled@@YAXXZ"() #2 { +define hidden void @"?doNothingUncalled@@YAXXZ"() #2 { entry: ret void } -; Alwaysinline and uncalled, this should be removed. +; Alwaysinline, hidden and uncalled, this should be removed. ; CHECK-NOT: define {{.*}}doAlwaysInlineUncalled -define void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 { +define hidden void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 { entry: ret void } -; Noinline and uncalled, this should be removed. +; Noinline, hidden and uncalled, this should be removed. ; CHECK-NOT: define {{.*}}doNoinlineUncalled -define void @"?doNoinlineUncalled@@YAXXZ"() #4 { +define hidden void @"?doNoinlineUncalled@@YAXXZ"() #4 { entry: ret void } @@ -49,44 +49,44 @@ entry: ret void } -; Marked external and uncalled, this should become internal and be removed. +; Marked external, hidden and uncalled, this should become internal and be removed. ; CHECK-NOT: define {{.*}}doExternalUncalled -define external void @"?doExternalUncalled@@YAXXZ"() #2 { +define external hidden void @"?doExternalUncalled@@YAXXZ"() #2 { entry: ret void } -; Alwaysinline, external and uncalled, this should become internal and be removed. +; Alwaysinline, external, hidden and uncalled, this should become internal and be removed. ; CHECK-NOT: define {{.*}}doAlwaysInlineExternalUncalled -define external void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 { +define external hidden void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 { entry: ret void } -; Noinline, external and uncalled, this should become internal and be removed. +; Noinline, external, hidden and uncalled, this should become internal and be removed. ; CHECK-NOT: define {{.*}}doNoinlineExternalUncalled -define external void @"?doNoinlineExternalUncalled@@YAXXZ"() #4 { +define external hidden void @"?doNoinlineExternalUncalled@@YAXXZ"() #4 { entry: ret void } -; No inlining attribute and called, this should stay. 
+; No inlining attribute, hidden and called, this should stay. ; CHECK: define {{.*}}doNothingCalled -define void @"?doNothingCalled@@YAXXZ"() #2 { +define hidden void @"?doNothingCalled@@YAXXZ"() #2 { entry: ret void } -; Alwaysinline and called, this should stay. +; Alwaysinline, hidden and called, this should stay. ; CHECK: define {{.*}}doAlwaysInlineCalled -define void @"?doAlwaysInlineCalled@@YAXXZ"() #0 { +define hidden void @"?doAlwaysInlineCalled@@YAXXZ"() #0 { entry: ret void } -; Noinline and called, this should stay. +; Noinline, hidden and called, this should stay. ; CHECK: define {{.*}}doNoinlineCalled -define void @"?doNoinlineCalled@@YAXXZ"() #4 { +define hidden void @"?doNoinlineCalled@@YAXXZ"() #4 { entry: ret void } @@ -112,23 +112,23 @@ entry: ret void } -; Marked external and called, this should become internal and stay. +; Marked external, hidden and called, this should become internal and stay. ; CHECK: define {{.*}}doExternalCalled -define external void @"?doExternalCalled@@YAXXZ"() #2 { +define external hidden void @"?doExternalCalled@@YAXXZ"() #2 { entry: ret void } -; Always inlined, external and called, this should become internal and stay. +; Always inlined, external, hidden and called, this should become internal and stay. ; CHECK: define {{.*}}doAlwaysInlineExternalCalled -define external void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 { +define external hidden void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 { entry: ret void } -; Noinline, external and called, this should become internal and stay. +; Noinline, external, hidden and called, this should become internal and stay. ; CHECK: define {{.*}}doNoinlineExternalCalled -define external void @"?doNoinlineExternalCalled@@YAXXZ"() #4 { +define external hidden void @"?doNoinlineExternalCalled@@YAXXZ"() #4 { entry: ret void } @@ -154,27 +154,6 @@ entry: ret void } -; No inlining attribute, internal, and exported; this should stay. 
-; CHECK: define {{.*}}doInternalExported -define internal void @"?doInternalExported@@YAXXZ"() #3 { -entry: - ret void -} - -; Alwaysinline, internal, and exported; this should stay. -; CHECK: define {{.*}}doAlwaysInlineInternalExported -define internal void @"?doAlwaysInlineInternalExported@@YAXXZ"() #1 { -entry: - ret void -} - -; Noinline, internal, and exported; this should stay. -; CHECK: define {{.*}}doNoinlineInternalExported -define internal void @"?doNoinlineInternalExported@@YAXXZ"() #5 { -entry: - ret void -} - ; Marked external and exported, this should stay. ; CHECK: define {{.*}}doExternalExported define external void @"?doExternalExported@@YAXXZ"() #3 { @@ -213,10 +192,10 @@ entry: } attributes #0 = { alwaysinline convergent norecurse nounwind } -attributes #1 = { alwaysinline convergent norecurse nounwind "hlsl.export"} +attributes #1 = { alwaysinline convergent norecurse nounwind } attributes #2 = { convergent norecurse nounwind } -attributes #3 = { convergent norecurse nounwind "hlsl.export"} +attributes #3 = { convergent norecurse nounwind } attributes #4 = { convergent noinline norecurse nounwind } -attributes #5 = { convergent noinline norecurse nounwind "hlsl.export"} +attributes #5 = { convergent noinline norecurse nounwind } attributes #6 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } attributes #7 = { convergent } diff --git a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll index 49c3bda621d74..971451f981c99 100644 --- a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll +++ b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll @@ -7,23 +7,23 @@ target triple = "dxilv1.5-pc-shadermodel6.5-compute" ; linkage and are not called from anywhere. This should be any function that ; is not an entry point. -; Has no specified inlining/linking behavior and is uncalled, this should be removed. 
+; Is hidden and is uncalled, this should be removed. ; CHECK-NOT: define {{.*}}doNothingUncalled -define void @"?doNothingUncalled@@YAXXZ"() #1 { +define hidden void @"?doNothingUncalled@@YAXXZ"() #1 { entry: ret void } -; Alwaysinline and uncalled, this should be removed. +; Alwaysinline, hidden and uncalled, this should be removed. ; CHECK-NOT: define {{.*}}doAlwaysInlineUncalled -define void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 { +define hidden void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 { entry: ret void } -; Noinline and uncalled, this should be removed. +; Noinline, hidden and uncalled, this should be removed. ; CHECK-NOT: define {{.*}}doNoinlineUncalled -define void @"?doNoinlineUncalled@@YAXXZ"() #3 { +define hidden void @"?doNoinlineUncalled@@YAXXZ"() #3 { entry: ret void } @@ -49,44 +49,44 @@ entry: ret void } -; Marked external and uncalled, this should become internal and be removed. +; Marked external, hidden, and uncalled, this should become internal and be removed. ; CHECK-NOT: define {{.*}}doExternalUncalled -define external void @"?doExternalUncalled@@YAXXZ"() #1 { +define external hidden void @"?doExternalUncalled@@YAXXZ"() #1 { entry: ret void } -; Alwaysinline, external and uncalled, this should become internal and be removed. +; Alwaysinline, external, hidden, and uncalled, this should become internal and be removed. ; CHECK-NOT: define {{.*}}doAlwaysInlineExternalUncalled -define external void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 { +define external hidden void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 { entry: ret void } -; Noinline, external and uncalled, this should become internal and be removed. +; Noinline, external, hidden, and uncalled, this should become internal and be removed. 
; CHECK-NOT: define {{.*}}doNoinlineExternalUncalled -define external void @"?doNoinlineExternalUncalled@@YAXXZ"() #3 { +define external hidden void @"?doNoinlineExternalUncalled@@YAXXZ"() #3 { entry: ret void } ; No inlining attribute and called, this should stay. ; CHECK: define {{.*}}doNothingCalled -define void @"?doNothingCalled@@YAXXZ"() #1 { +define hidden void @"?doNothingCalled@@YAXXZ"() #1 { entry: ret void } -; Alwaysinline and called, this should stay. +; Alwaysinline, hidden, and called, this should stay. ; CHECK: define {{.*}}doAlwaysInlineCalled -define void @"?doAlwaysInlineCalled@@YAXXZ"() #0 { +define hidden void @"?doAlwaysInlineCalled@@YAXXZ"() #0 { entry: ret void } -; Noinline and called, this should stay. +; Noinline, hidden, and called, this should stay. ; CHECK: define {{.*}}doNoinlineCalled -define void @"?doNoinlineCalled@@YAXXZ"() #3 { +define hidden void @"?doNoinlineCalled@@YAXXZ"() #3 { entry: ret void } @@ -112,23 +112,23 @@ entry: ret void } -; Marked external and called, this should become internal and stay. +; Marked external, hidden, and called, this should become internal and stay. ; CHECK: define {{.*}}doExternalCalled -define external void @"?doExternalCalled@@YAXXZ"() #1 { +define external hidden void @"?doExternalCalled@@YAXXZ"() #1 { entry: ret void } -; Always inlined, external and called, this should become internal and stay. +; Always inlined, external, hidden, and called, this should become internal and stay. ; CHECK: define {{.*}}doAlwaysInlineExternalCalled -define external void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 { +define external hidden void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 { entry: ret void } -; Noinline, external and called, this should become internal and stay. +; Noinline, external, hidden, and called, this should become internal and stay. 
; CHECK: define {{.*}}doNoinlineExternalCalled -define external void @"?doNoinlineExternalCalled@@YAXXZ"() #3 { +define external hidden void @"?doNoinlineExternalCalled@@YAXXZ"() #3 { entry: ret void } diff --git a/llvm/test/CodeGen/DirectX/finalize_linkage.ll b/llvm/test/CodeGen/DirectX/finalize_linkage.ll index c761a79a5c28a..df691db5cff36 100644 --- a/llvm/test/CodeGen/DirectX/finalize_linkage.ll +++ b/llvm/test/CodeGen/DirectX/finalize_linkage.ll @@ -3,8 +3,8 @@ target triple = "dxilv1.5-pc-shadermodel6.5-compute" -; DXILFinalizeLinkage changes linkage of all functions that are not -; entry points or exported function to internal. +; DXILFinalizeLinkage changes linkage of all functions that are hidden to +; internal. ; CHECK-NOT: define internal void @"?f1@@YAXXZ"() define void @"?f1@@YAXXZ"() #0 { @@ -13,19 +13,19 @@ entry: } ; CHECK: define internal void @"?f2@@YAXXZ"() -define void @"?f2@@YAXXZ"() #0 { +define hidden void @"?f2@@YAXXZ"() #0 { entry: ret void } ; CHECK: define internal void @"?f3@@YAXXZ"() -define void @"?f3@@YAXXZ"() #0 { +define hidden void @"?f3@@YAXXZ"() #0 { entry: ret void } ; CHECK: define internal void @"?foo@@YAXXZ"() -define void @"?foo@@YAXXZ"() #0 { +define hidden void @"?foo@@YAXXZ"() #0 { entry: call void @"?f2@@YAXXZ"() #3 ret void @@ -33,7 +33,7 @@ entry: ; Exported function - do not change linkage ; CHECK: define void @"?bar@@YAXXZ"() -define void @"?bar@@YAXXZ"() #1 { +define void @"?bar@@YAXXZ"() #0 { entry: call void @"?f3@@YAXXZ"() #3 ret void @@ -42,23 +42,22 @@ entry: ; CHECK: define internal void @"?main@@YAXXZ"() #0 define internal void @"?main@@YAXXZ"() #0 { entry: - call void @"?foo@@YAXXZ"() #3 - call void @"?bar@@YAXXZ"() #3 + call void @"?foo@@YAXXZ"() #2 + call void @"?bar@@YAXXZ"() #2 ret void } ; Entry point function - do not change linkage -; CHECK: define void @main() #2 -define void @main() #2 { +; CHECK: define void @main() #1 +define void @main() #1 { entry: call void @"?main@@YAXXZ"() ret void } 
attributes #0 = { convergent noinline nounwind optnone} -attributes #1 = { convergent noinline nounwind optnone "hlsl.export"} -attributes #2 = { convergent "hlsl.numthreads"="4,1,1" "hlsl.shader"="compute"} -attributes #3 = { convergent } +attributes #1 = { convergent "hlsl.numthreads"="4,1,1" "hlsl.shader"="compute"} +attributes #2 = { convergent } ; Make sure "hlsl.export" attribute is stripped by llc ; CHECK-LLC-NOT: "hlsl.export" diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll index 5c761014d471f..dc8c5f8421bfe 100644 --- a/llvm/test/CodeGen/DirectX/flatten-array.ll +++ b/llvm/test/CodeGen/DirectX/flatten-array.ll @@ -187,5 +187,75 @@ define void @global_gep_store() { ret void } +@g = local_unnamed_addr addrspace(3) global [2 x [2 x float]] zeroinitializer, align 4 +define void @two_index_gep() { + ; CHECK-LABEL: define void @two_index_gep( + ; CHECK: [[THREAD_ID:%.*]] = tail call i32 @llvm.dx.thread.id(i32 0) + ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[THREAD_ID]], 2 + ; CHECK-NEXT: [[ADD:%.*]] = add i32 1, [[MUL]] + ; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 [[ADD]] + ; CHECK-NEXT: load float, ptr addrspace(3) [[GEP_PTR]], align 4 + ; CHECK-NEXT: ret void + %1 = tail call i32 @llvm.dx.thread.id(i32 0) + %2 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 %1, i32 1 + %3 = load float, ptr addrspace(3) %2, align 4 + ret void +} + +define void @two_index_gep_const() { + ; CHECK-LABEL: define void @two_index_gep_const( + ; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 3 + ; CHECK-NEXT: load float, ptr addrspace(3) [[GEP_PTR]], align 4 + ; CHECK-NEXT: ret void + %1 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 1, i32 1 + %3 = load float, ptr addrspace(3) %1, align 4 + ret void +} + +define void @gep_4d_index_test() { + ; 
CHECK-LABEL: gep_4d_index_test + ; CHECK: [[a:%.*]] = alloca [16 x i32], align 4 + ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 1 + ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 3 + ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 7 + ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 15 + ; CHECK-NEXT: ret void + %1 = alloca [2x[2 x[2 x [2 x i32]]]], align 4 + %2 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 0, i32 0, i32 1 + %3 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 0, i32 1, i32 1 + %4 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 1, i32 1, i32 1 + %5 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 1, i32 1, i32 1, i32 1 + ret void +} + +define void @gep_4d_index_and_gep_chain_mixed() { + ; CHECK-LABEL: gep_4d_index_and_gep_chain_mixed + ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4 + ; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[ALLOCA]], i32 0, i32 {{[0-9]|1[0-5]}} + ; CHECK-NEXT: ret void + %1 = alloca [2x[2 x[2 x [2 x i32]]]], align 4 + %a4d0_0 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x[2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 0 + %a2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 0, i32 0 + %a2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 0, i32 1 + %a2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 1, i32 0 + %a2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 1, i32 1 + %b4d0_1 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 1 + %b2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x 
i32]]* %b4d0_1, i32 0, i32 0, i32 0 + %b2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 0, i32 1 + %b2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 1, i32 0 + %b2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 1, i32 1 + %c4d1_0 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 1, i32 0 + %c2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 0, i32 0 + %c2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 0, i32 1 + %c2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 1, i32 0 + %c2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 1, i32 1 + %g4d1_1 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 1, i32 1 + %g2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 0, i32 0 + %g2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 0, i32 1 + %g2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 1, i32 0 + %g2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 1, i32 1 + ret void +} + ; Make sure we don't try to walk the body of a function declaration. 
declare void @opaque_function() diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll index c960aad3d2627..d5797f6b51348 100644 --- a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll +++ b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll @@ -60,19 +60,15 @@ define <4 x i32> @load_array_vec_test() #0 { define <4 x i32> @load_vec_test() #0 { ; CHECK-LABEL: define <4 x i32> @load_vec_test( ; CHECK-SAME: ) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = bitcast ptr addrspace(3) @vecData.scalarized to ptr addrspace(3) -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 1) to ptr addrspace(3) -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 2) to ptr addrspace(3) -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 3) to ptr addrspace(3) -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4 -; CHECK-NEXT: [[DOTUPTO0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[DOTUPTO1:%.*]] = insertelement <4 x i32> [[DOTUPTO0]], i32 [[TMP4]], i32 1 -; CHECK-NEXT: [[DOTUPTO2:%.*]] = insertelement <4 x i32> [[DOTUPTO1]], i32 [[TMP6]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[DOTUPTO2]], i32 [[TMP8]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) @vecData.scalarized, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 1), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) 
getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 2), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 3), align 4 +; CHECK-NEXT: [[DOTUPTO0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTUPTO1:%.*]] = insertelement <4 x i32> [[DOTUPTO0]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[DOTUPTO2:%.*]] = insertelement <4 x i32> [[DOTUPTO1]], i32 [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[DOTUPTO2]], i32 [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x i32> [[TMP5]] ; %1 = load <4 x i32>, <4 x i32> addrspace(3)* @"vecData", align 4 ret <4 x i32> %1 @@ -103,31 +99,23 @@ define <4 x i32> @load_static_array_of_vec_test(i32 %index) #0 { define <4 x i32> @multid_load_test() #0 { ; CHECK-LABEL: define <4 x i32> @multid_load_test( ; CHECK-SAME: ) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = bitcast ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim to ptr addrspace(3) -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1) to ptr addrspace(3) -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 2) to ptr addrspace(3) -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3) to ptr addrspace(3) -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1) to ptr addrspace(3) -; 
CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 1) to ptr addrspace(3) -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 2) to ptr addrspace(3) -; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 3) to ptr addrspace(3) -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(3) [[TMP15]], align 4 -; CHECK-NEXT: [[DOTI05:%.*]] = add i32 [[TMP2]], [[TMP10]] -; CHECK-NEXT: [[DOTI16:%.*]] = add i32 [[TMP4]], [[TMP12]] -; CHECK-NEXT: [[DOTI27:%.*]] = add i32 [[TMP6]], [[TMP14]] -; CHECK-NEXT: [[DOTI38:%.*]] = add i32 [[TMP8]], [[TMP16]] -; CHECK-NEXT: [[DOTUPTO01215:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI05]], i32 0 -; CHECK-NEXT: [[DOTUPTO11316:%.*]] = insertelement <4 x i32> [[DOTUPTO01215]], i32 [[DOTI16]], i32 1 -; CHECK-NEXT: [[DOTUPTO21417:%.*]] = insertelement <4 x i32> [[DOTUPTO11316]], i32 [[DOTI27]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[DOTUPTO21417]], i32 [[DOTI38]], i32 3 -; CHECK-NEXT: ret <4 x i32> [[TMP17]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) 
getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 2), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), align 4 +; CHECK-NEXT: [[DOTI13:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 1), align 4 +; CHECK-NEXT: [[DOTI25:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 2), align 4 +; CHECK-NEXT: [[DOTI37:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 3), align 4 +; CHECK-NEXT: [[DOTI08:%.*]] = add i32 [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[DOTI19:%.*]] = add i32 [[TMP2]], [[DOTI13]] +; CHECK-NEXT: [[DOTI210:%.*]] = add i32 [[TMP3]], [[DOTI25]] +; CHECK-NEXT: [[DOTI311:%.*]] = add i32 [[TMP4]], [[DOTI37]] +; CHECK-NEXT: [[DOTUPTO015:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI08]], i32 0 +; CHECK-NEXT: [[DOTUPTO116:%.*]] = insertelement <4 x i32> [[DOTUPTO015]], i32 [[DOTI19]], i32 1 +; CHECK-NEXT: [[DOTUPTO217:%.*]] = insertelement <4 x i32> [[DOTUPTO116]], i32 [[DOTI210]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[DOTUPTO217]], i32 [[DOTI311]], i32 3 +; CHECK-NEXT: ret <4 x i32> [[TMP6]] ; %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 0, i32 0), align 4 %2 = load <4 x i32>, <4 x i32> 
addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 1, i32 1), align 4 diff --git a/llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll b/llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll new file mode 100644 index 0000000000000..1f33700e014c7 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll @@ -0,0 +1,53 @@ +; RUN: opt -S --dxil-prepare %s | FileCheck %s + +; Test that global arrays do not get a bitcast instruction +; after the dxil-prepare pass. + +target triple = "dxilv1.2-unknown-shadermodel6.2-compute" + +@inputTile.1dim = local_unnamed_addr addrspace(3) global [3 x float] zeroinitializer, align 2 + +; CHECK-LABEL: testload +define float @testload() local_unnamed_addr { + ; NOTE: this would be "bitcast ptr addrspace(3)..." before the change that introduced this test, + ; after the dxil-prepare pass is run + ; CHECK-NEXT: load float, ptr addrspace(3) @inputTile.1dim, align 2 + %v = load float, ptr addrspace(3) @inputTile.1dim, align 2 + + ret float %v +} + +; CHECK-LABEL: teststore +define void @teststore() local_unnamed_addr { + ; CHECK-NEXT: store float 2.000000e+00, ptr addrspace(3) @inputTile.1dim, align 2 + store float 2.000000e+00, ptr addrspace(3) @inputTile.1dim, align 2 + + ret void +} + +; CHECK-LABEL: testGEPConst +define float @testGEPConst() local_unnamed_addr { + ; CHECK-NEXT: load float, ptr addrspace(3) getelementptr (float, ptr addrspace(3) @inputTile.1dim, i32 1), align 4 + %v = load float, ptr addrspace(3) getelementptr (float, ptr addrspace(3) @inputTile.1dim, i32 1), align 4 + + ret float %v +} + +; CHECK-LABEL: testGEPNonConst +define float @testGEPNonConst(i32 %i) local_unnamed_addr { + ; CHECK-NEXT: getelementptr float, ptr addrspace(3) @inputTile.1dim, i32 %i + %gep = getelementptr float, ptr addrspace(3) @inputTile.1dim, i32 %i + %v = load float, ptr addrspace(3) %gep + + ret float %v +} + +; 
CHECK-LABEL: testAlloca +define float @testAlloca(i32 %i) local_unnamed_addr { + ; CHECK-NEXT: alloca [3 x float], align 4 + %arr = alloca [3 x float], align 4 + ; CHECK-NEXT: getelementptr [3 x float], ptr %arr, i32 1 + %gep = getelementptr [3 x float], ptr %arr, i32 1 + %v = load float, ptr %gep + ret float %v +} diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll index e564d7bddea6f..27be02c50f1c7 100644 --- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll @@ -885,9 +885,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind { define i8 @test_not_cttz_i8(i8 %a) nounwind { ; LA32R-LABEL: test_not_cttz_i8: ; LA32R: # %bb.0: -; LA32R-NEXT: nor $a1, $a0, $zero -; LA32R-NEXT: addi.w $a1, $a1, -1 -; LA32R-NEXT: and $a0, $a0, $a1 +; LA32R-NEXT: addi.w $a1, $a0, 1 +; LA32R-NEXT: andn $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: andi $a1, $a1, 85 ; LA32R-NEXT: sub.w $a0, $a0, $a1 @@ -921,9 +920,8 @@ define i8 @test_not_cttz_i8(i8 %a) nounwind { define i16 @test_not_cttz_i16(i16 %a) nounwind { ; LA32R-LABEL: test_not_cttz_i16: ; LA32R: # %bb.0: -; LA32R-NEXT: nor $a1, $a0, $zero -; LA32R-NEXT: addi.w $a1, $a1, -1 -; LA32R-NEXT: and $a0, $a0, $a1 +; LA32R-NEXT: addi.w $a1, $a0, 1 +; LA32R-NEXT: andn $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: lu12i.w $a2, 5 ; LA32R-NEXT: ori $a2, $a2, 1365 diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll index c4d1537557cad..4e11f58f85ee0 100644 --- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll +++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %} @@ -10,14 +11,29 @@ target datalayout = 
"e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; CHECK-LABEL: spam define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 { +; CHECK-LABEL: spam( +; CHECK: .maxntid 1, 1, 1 +; CHECK-NEXT: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %bb +; CHECK-NEXT: ld.param.b64 %rd1, [spam_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [spam_param_3]; +; CHECK-NEXT: shl.b64 %rd3, %rd2, 1; +; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3; +; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1]; +; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16]; +; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1; +; CHECK-NEXT: ld.global.b64 %rd7, [%rd5]; +; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7; +; CHECK-NEXT: st.global.b64 [%rd5], %rd8; +; CHECK-NEXT: ret; bb: %tmp5 = add nsw i64 %arg3, 8 %tmp6 = getelementptr i16, ptr addrspace(1) %arg, i64 %tmp5 -; CHECK: ld.global.nc.b16 %tmp7 = load i16, ptr addrspace(1) %tmp6, align 2 -; CHECK: cvt.s32.s16 %tmp8 = sext i16 %tmp7 to i64 %tmp9 = mul nsw i64 %tmp8, %tmp8 %tmp10 = load i64, ptr addrspace(1) %arg1, align 8 diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll index 3b30ce560edbc..6148c0756e393 100644 --- a/llvm/test/CodeGen/NVPTX/bug26185.ll +++ b/llvm/test/CodeGen/NVPTX/bug26185.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %} @@ -7,45 +8,93 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" target triple = "nvptx64-unknown-unknown" -; CHECK-LABEL: ex_zext define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) { +; CHECK-LABEL: ex_zext( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; 
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b64 %rd1, [ex_zext_param_0]; +; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1; +; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_param_1]; +; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3; +; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd2]; +; CHECK-NEXT: st.global.b32 [%rd4], %r1; +; CHECK-NEXT: ret; entry: -; CHECK: ld.global.nc.b8 %val = load i8, ptr %data -; CHECK: cvt.u32.u8 %valext = zext i8 %val to i32 store i32 %valext, ptr %res ret void } -; CHECK-LABEL: ex_sext define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) { +; CHECK-LABEL: ex_sext( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b64 %rd1, [ex_sext_param_0]; +; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1; +; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_param_1]; +; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3; +; CHECK-NEXT: ld.global.nc.s8 %r1, [%rd2]; +; CHECK-NEXT: st.global.b32 [%rd4], %r1; +; CHECK-NEXT: ret; entry: -; CHECK: ld.global.nc.b8 %val = load i8, ptr %data -; CHECK: cvt.s32.s8 %valext = sext i8 %val to i32 store i32 %valext, ptr %res ret void } -; CHECK-LABEL: ex_zext_v2 define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) { +; CHECK-LABEL: ex_zext_v2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b64 %rd1, [ex_zext_v2_param_0]; +; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1; +; CHECK-NEXT: ld.param.b64 %rd3, [ex_zext_v2_param_1]; +; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3; +; CHECK-NEXT: ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2]; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-NEXT: st.global.v2.b32 [%rd4], {%r2, %r1}; +; CHECK-NEXT: ret; entry: -; CHECK: ld.global.nc.v2.b8 %val = load <2 x i8>, ptr %data -; CHECK: cvt.u32.u16 %valext 
= zext <2 x i8> %val to <2 x i32> store <2 x i32> %valext, ptr %res ret void } -; CHECK-LABEL: ex_sext_v2 define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) { +; CHECK-LABEL: ex_sext_v2( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b64 %rd1, [ex_sext_v2_param_0]; +; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1; +; CHECK-NEXT: ld.param.b64 %rd3, [ex_sext_v2_param_1]; +; CHECK-NEXT: cvta.to.global.u64 %rd4, %rd3; +; CHECK-NEXT: ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2]; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs2; +; CHECK-NEXT: cvt.s32.s8 %r2, %r1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: cvt.s32.s8 %r4, %r3; +; CHECK-NEXT: st.global.v2.b32 [%rd4], {%r4, %r2}; +; CHECK-NEXT: ret; entry: -; CHECK: ld.global.nc.v2.b8 %val = load <2 x i8>, ptr %data -; CHECK: cvt.s32.s8 %valext = sext <2 x i8> %val to <2 x i32> store <2 x i32> %valext, ptr %res ret void diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll index bb88d1f2755ca..3dceefb93a47d 100644 --- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll +++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll @@ -7,7 +7,6 @@ target triple = "nvptx-nvidia-cuda" define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) { ; CHECK-LABEL: foo( -; CHECK: .reg .b16 %rs<2>; ; CHECK: .reg .b32 %r<4>; ; CHECK: .reg .b64 %rd<5>; ; CHECK-EMPTY: @@ -15,8 +14,7 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) { ; CHECK: cvta.to.global.u64 %rd2, %rd1; ; CHECK: ld.param.b64 %rd3, [foo_param_1]; ; CHECK: cvta.to.global.u64 %rd4, %rd3; -; CHECK: ld.global.nc.b8 %rs1, [%rd2]; -; CHECK: cvt.u32.u8 %r1, %rs1; +; CHECK: ld.global.nc.b8 %r1, [%rd2]; ; CHECK: add.s32 %r2, %r1, 1; ; CHECK: and.b32 %r3, %r2, 1; ; CHECK: st.global.b32 [%rd4], %r3; diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll 
b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll index 7ac697c4ce203..7f4b049af84fb 100644 --- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll +++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll @@ -163,14 +163,12 @@ define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) { define i8 @test_ldg_i8(ptr addrspace(1) %ptr) { ; CHECK-LABEL: test_ldg_i8( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i8_param_0]; -; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd1]; -; CHECK-NEXT: cvt.u32.u8 %r1, %rs1; +; CHECK-NEXT: ld.global.nc.b8 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4) @@ -180,14 +178,12 @@ define i8 @test_ldg_i8(ptr addrspace(1) %ptr) { define i16 @test_ldg_i16(ptr addrspace(1) %ptr) { ; CHECK-LABEL: test_ldg_i16( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i16_param_0]; -; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd1]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: ld.global.nc.b16 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2) diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index 3bbdf641ade26..ddaa9fd831af7 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -211,7 +211,7 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24]; ; CHECK-PTX-NEXT: .reg .b64 %SP; ; CHECK-PTX-NEXT: .reg .b64 %SPL; -; CHECK-PTX-NEXT: .reg .b16 %rs<8>; +; CHECK-PTX-NEXT: .reg .b16 %rs<5>; ; CHECK-PTX-NEXT: .reg .b32 %r<4>; ; CHECK-PTX-NEXT: .reg .b64 %rd<5>; ; 
CHECK-PTX-EMPTY: @@ -220,18 +220,15 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; ; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0; ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7]; -; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1; -; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs2; -; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+6]; -; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3; -; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs4; -; CHECK-PTX-NEXT: ld.global.nc.b8 %rs5, [__const_$_bar_$_s1+5]; -; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5; -; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs6; +; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1; +; CHECK-PTX-NEXT: ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6]; +; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2; +; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5]; +; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3; ; CHECK-PTX-NEXT: mov.b32 %r1, 1; ; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1; -; CHECK-PTX-NEXT: mov.b16 %rs7, 1; -; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs7; +; CHECK-PTX-NEXT: mov.b16 %rs4, 1; +; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs4; ; CHECK-PTX-NEXT: mov.b64 %rd3, 1; ; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd3; ; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8; diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll new file mode 100644 index 0000000000000..67800df6ed4b5 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll @@ -0,0 +1,588 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \ +; RUN: -mattr=-altivec -verify-machineinstrs < %s | \ +; RUN: FileCheck --check-prefixes=32BIT %s + +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \ +; RUN: -mattr=-altivec -verify-machineinstrs < %s | \ +; RUN: FileCheck --check-prefixes=64BIT %s + +%struct.S0 = type {} + +%struct.S1 = type { [1 x i8] } +@gS1 = 
external global %struct.S1, align 1 + +define void @call_test_byval_1Byte() { + ; 32BIT-LABEL: name: call_test_byval_1Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: renamable $r3 = LWZtoc @gS1, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r3 = LBZ 0, killed renamable $r3 :: (load (s8)) + ; 32BIT-NEXT: renamable $r3 = RLWINM killed renamable $r3, 24, 0, 7 + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1, implicit-def dead $r3 + ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; + ; 64BIT-LABEL: name: call_test_byval_1Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: renamable $x3 = LDtoc @gS1, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: renamable $x3 = LBZ8 0, killed renamable $x3 :: (load (s8)) + ; 64BIT-NEXT: renamable $x3 = RLDICR killed renamable $x3, 56, 7 + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3 + ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm +entry: + %s0 = alloca %struct.S0, align 8 + %call = call zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 @gS1) + ret void +} + +define zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 %s) { + ; 32BIT-LABEL: name: test_byval_1Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: liveins: $r3 + ; 32BIT-NEXT: {{ $}} + ; 32BIT-NEXT: renamable $r4 = COPY $r3 + ; 32BIT-NEXT: renamable $r3 = RLWINM $r3, 8, 24, 31 + ; 32BIT-NEXT: STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8) + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 + ; + 
; 64BIT-LABEL: name: test_byval_1Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: liveins: $x3 + ; 64BIT-NEXT: {{ $}} + ; 64BIT-NEXT: renamable $x4 = COPY $x3 + ; 64BIT-NEXT: renamable $x3 = RLDICL $x3, 8, 56 + ; 64BIT-NEXT: STD killed renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16) + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 +entry: + %0 = load i8, ptr %s, align 1 + ret i8 %0 +} + +@f = common global float 0.000000e+00, align 4 + +%struct.S2 = type { [2 x i8] } + +@gS2 = external global %struct.S2, align 1 + +define void @call_test_byval_2Byte() { + ; 32BIT-LABEL: name: call_test_byval_2Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: renamable $r3 = LWZtoc @f, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f) + ; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: renamable $r3 = LWZtoc @gS2, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r3 = LHZ 0, killed renamable $r3 :: (load (s16)) + ; 32BIT-NEXT: renamable $r5 = RLWINM killed renamable $r3, 16, 0, 15 + ; 32BIT-NEXT: $r3 = LI 42 + ; 32BIT-NEXT: $f2 = COPY renamable $f1 + ; 32BIT-NEXT: $r7 = LI 43 + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r5, implicit killed $f2, implicit killed $r7, implicit $r2, implicit-def $r1, implicit-def dead $r3 + ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; + ; 64BIT-LABEL: name: call_test_byval_2Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: renamable $x3 = LDtoc @f, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f) + ; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: renamable $x3 = LDtoc @gS2, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: renamable $x3 = LHZ8 0, 
killed renamable $x3 :: (load (s16)) + ; 64BIT-NEXT: renamable $x5 = RLDICR killed renamable $x3, 48, 15 + ; 64BIT-NEXT: $x3 = LI8 42 + ; 64BIT-NEXT: $f2 = COPY renamable $f1 + ; 64BIT-NEXT: $x7 = LI8 43 + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x5, implicit killed $f2, implicit killed $x7, implicit $x2, implicit-def $r1, implicit-def dead $x3 + ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm +entry: + %0 = load float, ptr @f, align 4 + %call = call zeroext i8 @test_byval_2Byte(i32 signext 42, float %0, ptr byval(%struct.S2) align 1 @gS2, float %0, i32 signext 43) + ret void +} + +define zeroext i8 @test_byval_2Byte(i32, float, ptr byval(%struct.S2) align 1 %s, float, i32) { + ; 32BIT-LABEL: name: test_byval_2Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: liveins: $r5 + ; 32BIT-NEXT: {{ $}} + ; 32BIT-NEXT: STW killed renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16) + ; 32BIT-NEXT: renamable $r3 = LBZ 1, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx) + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 + ; + ; 64BIT-LABEL: name: test_byval_2Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: liveins: $x5 + ; 64BIT-NEXT: {{ $}} + ; 64BIT-NEXT: STD killed renamable $x5, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16) + ; 64BIT-NEXT: renamable $x3 = LBZ8 1, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx) + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 +entry: + %arrayidx = getelementptr inbounds %struct.S2, ptr %s, i32 0, i32 0, i32 1 + %4 = load i8, ptr %arrayidx, align 1 + ret i8 %4 +} + +%struct.S3 = type <{ i8, i16 }> +@gS3 = external global %struct.S3, align 1 + +define void @call_test_byval_3Byte() { + ; 32BIT-LABEL: name: call_test_byval_3Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: ADJCALLSTACKDOWN 60, 0, implicit-def dead $r1, 
implicit $r1 + ; 32BIT-NEXT: renamable $r3 = LI 42 + ; 32BIT-NEXT: renamable $r4 = LWZtoc @gS3, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: STW killed renamable $r3, 56, $r1 :: (store (s32)) + ; 32BIT-NEXT: renamable $r3 = LBZ 2, renamable $r4 :: (load (s8)) + ; 32BIT-NEXT: renamable $r4 = LHZ 0, killed renamable $r4 :: (load (s16)) + ; 32BIT-NEXT: renamable $r10 = RLWINM killed renamable $r3, 8, 16, 23 + ; 32BIT-NEXT: renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r4, 16, 0, 15 + ; 32BIT-NEXT: $r3 = LI 1 + ; 32BIT-NEXT: $r4 = LI 2 + ; 32BIT-NEXT: $r5 = LI 3 + ; 32BIT-NEXT: $r6 = LI 4 + ; 32BIT-NEXT: $r7 = LI 5 + ; 32BIT-NEXT: $r8 = LI 6 + ; 32BIT-NEXT: $r9 = LI 7 + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3 + ; 32BIT-NEXT: ADJCALLSTACKUP 60, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; + ; 64BIT-LABEL: name: call_test_byval_3Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: renamable $x3 = LI8 42 + ; 64BIT-NEXT: renamable $x4 = LDtoc @gS3, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: STD killed renamable $x3, 112, $x1 :: (store (s64)) + ; 64BIT-NEXT: renamable $x3 = LBZ8 2, renamable $x4 :: (load (s8)) + ; 64BIT-NEXT: renamable $x4 = LHZ8 0, killed renamable $x4 :: (load (s16)) + ; 64BIT-NEXT: renamable $x10 = RLDIC killed renamable $x3, 40, 16 + ; 64BIT-NEXT: renamable $x10 = RLDIMI killed renamable $x10, killed renamable $x4, 48, 0 + ; 64BIT-NEXT: $x3 = LI8 1 + ; 64BIT-NEXT: $x4 = LI8 2 + ; 64BIT-NEXT: $x5 = LI8 3 + ; 64BIT-NEXT: $x6 = LI8 4 + ; 64BIT-NEXT: $x7 = LI8 5 + ; 64BIT-NEXT: $x8 = LI8 6 + ; 64BIT-NEXT: $x9 = LI8 7 + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, 
implicit $x3, implicit $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3 + ; 64BIT-NEXT: ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm +entry: + %call = call zeroext i16 @test_byval_3Byte(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, ptr byval(%struct.S3) align 1 @gS3, i32 42) + ret void +} + +define zeroext i16 @test_byval_3Byte(i32, i32, i32, i32, i32, i32, i32, ptr byval(%struct.S3) align 1 %s, i32) { + ; 32BIT-LABEL: name: test_byval_3Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: liveins: $r10 + ; 32BIT-NEXT: {{ $}} + ; 32BIT-NEXT: STW killed renamable $r10, 0, %fixed-stack.1 :: (store (s32) into %fixed-stack.1) + ; 32BIT-NEXT: renamable $r3 = LHZ 1, %fixed-stack.1 :: (dereferenceable load (s16) from %ir.gep, align 1) + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 + ; + ; 64BIT-LABEL: name: test_byval_3Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: liveins: $x10 + ; 64BIT-NEXT: {{ $}} + ; 64BIT-NEXT: STD killed renamable $x10, 0, %fixed-stack.1 :: (store (s64) into %fixed-stack.1) + ; 64BIT-NEXT: renamable $x3 = LHZ8 1, %fixed-stack.1 :: (dereferenceable load (s16) from %ir.gep, align 1) + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 +entry: + %gep = getelementptr inbounds %struct.S3, ptr %s, i32 0, i32 1 + %8 = load i16, ptr %gep, align 1 + ret i16 %8 +} + +%struct.S4 = type { [4 x i8] } +%struct.S4A = type { i32 } + +@gS4 = external global %struct.S4, align 1 + +define void @call_test_byval_4Byte() { + ; 32BIT-LABEL: name: call_test_byval_4Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: renamable $r3 = LWZtoc @gS4, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r4 = LWZ 0, %stack.1.s4a :: 
(load (s32) from %stack.1.s4a, align 8) + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3 + ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; + ; 64BIT-LABEL: name: call_test_byval_4Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: renamable $x3 = LDtoc @gS4, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: renamable $x3 = LWZ8 0, killed renamable $x3 :: (load (s32)) + ; 64BIT-NEXT: renamable $x4 = LWZ8 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8) + ; 64BIT-NEXT: renamable $x3 = RLDICR killed renamable $x3, 32, 31 + ; 64BIT-NEXT: renamable $x4 = RLDICR killed renamable $x4, 32, 31 + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def dead $x3 + ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm +entry: + %s0 = alloca %struct.S0, align 8 + %s4a = alloca %struct.S4A, align 8 + %call = call signext i32 @test_byval_4Byte(ptr byval(%struct.S4) align 1 @gS4, ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S4A) align 4 %s4a) + ret void +} + +define signext i32 @test_byval_4Byte(ptr byval(%struct.S4) align 1 %s, ptr byval(%struct.S0) align 1, ptr byval(%struct.S4A) align 4 %s4a) { + ; 32BIT-LABEL: name: test_byval_4Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: liveins: $r3, $r4 + ; 32BIT-NEXT: {{ $}} + ; 32BIT-NEXT: STW renamable $r3, 0, %fixed-stack.2 :: (store (s32) into %fixed-stack.2, align 8) + ; 32BIT-NEXT: renamable $r3 = RLWINM killed renamable $r3, 0, 24, 31 + ; 32BIT-NEXT: renamable $r3 = nsw ADD4 renamable $r4, killed renamable $r3 + ; 32BIT-NEXT: STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0) + ; 32BIT-NEXT: 
BLR implicit $lr, implicit $rm, implicit $r3 + ; + ; 64BIT-LABEL: name: test_byval_4Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: liveins: $x3, $x4 + ; 64BIT-NEXT: {{ $}} + ; 64BIT-NEXT: STD killed renamable $x3, 0, %fixed-stack.2 :: (store (s64) into %fixed-stack.2, align 16) + ; 64BIT-NEXT: STD renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0) + ; 64BIT-NEXT: renamable $r3 = LBZ 3, %fixed-stack.2 :: (dereferenceable load (s8) from %ir.arrayidx) + ; 64BIT-NEXT: renamable $x4 = RLDICL killed renamable $x4, 32, 32 + ; 64BIT-NEXT: renamable $r3 = nsw ADD4 renamable $r4, killed renamable $r3, implicit killed $x4 + ; 64BIT-NEXT: renamable $x3 = EXTSW_32_64 killed renamable $r3 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 +entry: + %arrayidx = getelementptr inbounds %struct.S4, ptr %s, i32 0, i32 0, i32 3 + %1 = load i8, ptr %arrayidx, align 1 + %2 = load i32, ptr %s4a, align 4 + %conv = zext i8 %1 to i32 + %add = add nsw i32 %2, %conv + ret i32 %add +} + +%struct.S5 = type { [5 x i8] } + +@gS5 = external global %struct.S5, align 1 + +define void @call_test_byval_5Byte() { + ; 32BIT-LABEL: name: call_test_byval_5Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: renamable $r3 = LWZtoc @gS5, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r4 = LBZ 4, renamable $r3 :: (load (s8)) + ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r4 = RLWINM killed renamable $r4, 24, 0, 7 + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3 + ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; + ; 64BIT-LABEL: name: call_test_byval_5Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: 
renamable $x3 = LDtoc @gS5, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: renamable $x4 = LBZ8 4, renamable $x3 :: (load (s8)) + ; 64BIT-NEXT: renamable $x5 = LWZ8 0, killed renamable $x3 :: (load (s32)) + ; 64BIT-NEXT: renamable $x3 = RLWINM8 killed renamable $x4, 24, 0, 7 + ; 64BIT-NEXT: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x5, 32, 0 + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3 + ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm +entry: + %call = call zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1 @gS5) + ret void +} + +declare zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1) + +%struct.S6 = type { [6 x i8] } + +@gS6 = external global %struct.S6, align 1 + +define void @call_test_byval_6Byte() { + ; 32BIT-LABEL: name: call_test_byval_6Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: renamable $r3 = LWZtoc @gS6, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r4 = LHZ 4, renamable $r3 :: (load (s16)) + ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r4 = RLWINM killed renamable $r4, 16, 0, 15 + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3 + ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; + ; 64BIT-LABEL: name: call_test_byval_6Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: renamable $x3 = LDtoc @gS6, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: renamable $x4 = LHZ8 4, renamable $x3 :: (load (s16)) + ; 64BIT-NEXT: renamable $x5 = LWZ8 0, killed renamable $x3 :: (load 
(s32)) + ; 64BIT-NEXT: renamable $x3 = RLWINM8 killed renamable $x4, 16, 0, 15 + ; 64BIT-NEXT: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x5, 32, 0 + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3 + ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm +entry: + %call = call zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1 @gS6) + ret void +} + +declare zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1) + +%struct.S7 = type { [7 x i8] } + +@gS7 = external global %struct.S7, align 1 + +define void @call_test_byval_7Byte() { + ; 32BIT-LABEL: name: call_test_byval_7Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: renamable $r3 = LWZtoc @gS7, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r5 = LHZ 4, renamable $r3 :: (load (s16)) + ; 32BIT-NEXT: renamable $r4 = LBZ 6, renamable $r3 :: (load (s8)) + ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r4 = RLWINM killed renamable $r4, 8, 16, 23 + ; 32BIT-NEXT: renamable $r4 = RLWIMI killed renamable $r4, killed renamable $r5, 16, 0, 15 + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3 + ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; + ; 64BIT-LABEL: name: call_test_byval_7Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: renamable $x3 = LDtoc @gS7, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: renamable $x4 = LHZ8 4, renamable $x3 :: (load (s16)) + ; 64BIT-NEXT: renamable $x5 = LBZ8 6, renamable $x3 :: (load (s8)) + ; 64BIT-NEXT: renamable $x6 = LWZ8 
0, killed renamable $x3 :: (load (s32)) + ; 64BIT-NEXT: renamable $x3 = RLWINM8 killed renamable $x5, 8, 16, 23 + ; 64BIT-NEXT: renamable $x3 = RLWIMI8 killed renamable $x3, killed renamable $x4, 16, 0, 15 + ; 64BIT-NEXT: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x6, 32, 0 + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3 + ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm +entry: + %call = call zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1 @gS7) + ret void +} + +declare zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1) + +%struct.S8 = type { [8 x i8] } + +@gS8 = external global %struct.S8, align 1 + +define void @call_test_byval_8Byte() { + ; 32BIT-LABEL: name: call_test_byval_8Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: renamable $r3 = LWZtoc @gS8, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r4 = LWZ 4, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3 + ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; + ; 64BIT-LABEL: name: call_test_byval_8Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: renamable $x3 = LDtoc @gS8, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: renamable $x3 = LD 0, killed renamable $x3 :: (load (s64)) + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3 + ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, 
implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm +entry: + %call = call zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1 @gS8) + ret void +} + +declare zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1) + +%struct.S32 = type { [32 x i8] } + +@gS32 = external global %struct.S32, align 1 + +define void @call_test_byval_32Byte() { + ; 32BIT-LABEL: name: call_test_byval_32Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: renamable $r3 = LWZtoc @gS32, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r10 = LWZ 28, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r9 = LWZ 24, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r8 = LWZ 20, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r7 = LWZ 16, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r6 = LWZ 12, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r5 = LWZ 8, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r4 = LWZ 4, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3 + ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; + ; 64BIT-LABEL: name: call_test_byval_32Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: renamable $x3 = LDtoc @gS32, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: renamable $x6 = LD 24, renamable $x3 :: (load (s64)) + ; 64BIT-NEXT: renamable $x5 = LD 16, renamable $x3 :: (load (s64)) + ; 64BIT-NEXT: renamable $x4 = LD 8, renamable $x3 :: (load (s64)) + ; 64BIT-NEXT: renamable $x3 = LD 0, killed renamable $x3 :: 
(load (s64)) + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1, implicit-def dead $x3 + ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm +entry: + %call = call zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 @gS32) + ret void +} + +define zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 %s) { + ; 32BIT-LABEL: name: test_byval_32Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 + ; 32BIT-NEXT: {{ $}} + ; 32BIT-NEXT: STW killed renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20) + ; 32BIT-NEXT: STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8) + ; 32BIT-NEXT: STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4) + ; 32BIT-NEXT: STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8) + ; 32BIT-NEXT: STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12) + ; 32BIT-NEXT: STW killed renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 8) + ; 32BIT-NEXT: renamable $r3 = LBZ 21, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx) + ; 32BIT-NEXT: STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24, align 8) + ; 32BIT-NEXT: STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28) + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 + ; + ; 64BIT-LABEL: name: test_byval_32Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6 + ; 64BIT-NEXT: {{ $}} + ; 64BIT-NEXT: STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16, align 16) + ; 64BIT-NEXT: STD killed renamable $x3, 0, %fixed-stack.0 :: (store (s64) 
into %fixed-stack.0, align 16) + ; 64BIT-NEXT: renamable $x3 = LBZ8 21, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx) + ; 64BIT-NEXT: STD killed renamable $x4, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8) + ; 64BIT-NEXT: STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24) + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 +entry: + %arrayidx = getelementptr inbounds %struct.S32, ptr %s, i32 0, i32 0, i32 21 + %0 = load i8, ptr %arrayidx, align 1 + ret i8 %0 +} + +; The ByVal handling produces dead stores. See `LowerFormalArguments_AIX` for +; details on why. + +%struct.S31 = type <{ float, i32, i64, double, i32, i16, i8 }> + +@gS31 = external global %struct.S31, align 1 + +define void @call_test_byval_31Byte() { + ; 32BIT-LABEL: name: call_test_byval_31Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: renamable $r3 = LWZtoc @gS31, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: renamable $r11 = LHZ 28, renamable $r3 :: (load (s16)) + ; 32BIT-NEXT: renamable $r10 = LBZ 30, renamable $r3 :: (load (s8)) + ; 32BIT-NEXT: renamable $r9 = LWZ 24, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r8 = LWZ 20, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r7 = LWZ 16, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r6 = LWZ 12, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r5 = LWZ 8, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r4 = LWZ 4, renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32)) + ; 32BIT-NEXT: renamable $r10 = RLWINM killed renamable $r10, 8, 16, 23 + ; 32BIT-NEXT: renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r11, 16, 0, 15 + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, 
implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $f1 + ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; + ; 64BIT-LABEL: name: call_test_byval_31Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: renamable $x3 = LDtoc @gS31, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: renamable $x7 = LHZ8 28, renamable $x3 :: (load (s16)) + ; 64BIT-NEXT: renamable $x6 = LBZ8 30, renamable $x3 :: (load (s8)) + ; 64BIT-NEXT: renamable $x8 = LWZ8 24, renamable $x3 :: (load (s32)) + ; 64BIT-NEXT: renamable $x5 = LD 16, renamable $x3 :: (load (s64)) + ; 64BIT-NEXT: renamable $x4 = LD 8, renamable $x3 :: (load (s64)) + ; 64BIT-NEXT: renamable $x6 = RLWINM8 killed renamable $x6, 8, 16, 23 + ; 64BIT-NEXT: renamable $x3 = LD 0, killed renamable $x3 :: (load (s64)) + ; 64BIT-NEXT: renamable $x6 = RLWIMI8 killed renamable $x6, killed renamable $x7, 16, 0, 15 + ; 64BIT-NEXT: renamable $x6 = RLDIMI killed renamable $x6, killed renamable $x8, 32, 0 + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1, implicit-def dead $f1 + ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm +entry: + %call = call double @test_byval_31Byte(ptr byval(%struct.S31) align 1 @gS31) + ret void +} + +define double @test_byval_31Byte(ptr byval(%struct.S31) align 1 %s) { + ; 32BIT-LABEL: name: test_byval_31Byte + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 + ; 32BIT-NEXT: {{ $}} + ; 32BIT-NEXT: STW killed renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20) + ; 32BIT-NEXT: STW killed renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 8) + ; 32BIT-NEXT: STW killed renamable $r3, 0, 
%fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8) + ; 32BIT-NEXT: STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4) + ; 32BIT-NEXT: STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8) + ; 32BIT-NEXT: STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12) + ; 32BIT-NEXT: renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64) from %ir.gep) + ; 32BIT-NEXT: STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24, align 8) + ; 32BIT-NEXT: STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28) + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $f1 + ; + ; 64BIT-LABEL: name: test_byval_31Byte + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6 + ; 64BIT-NEXT: {{ $}} + ; 64BIT-NEXT: STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16, align 16) + ; 64BIT-NEXT: STD killed renamable $x3, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16) + ; 64BIT-NEXT: renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64) from %ir.gep, align 16) + ; 64BIT-NEXT: STD killed renamable $x4, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8) + ; 64BIT-NEXT: STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24) + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $f1 +entry: + %gep = getelementptr inbounds %struct.S31, ptr %s, i32 0, i32 3 + %load = load double, ptr %gep, align 1 + ret double %load +} + +%struct.F = type { float, float, float } + +define i32 @call_test_byval_homogeneous_float_struct() { + ; 32BIT-LABEL: name: call_test_byval_homogeneous_float_struct + ; 32BIT: bb.0.entry: + ; 32BIT-NEXT: renamable $r3 = LI 0 + ; 32BIT-NEXT: STW renamable $r3, 8, %stack.0.s :: (store (s32) into %ir.s + 8, align 8) + ; 32BIT-NEXT: STW renamable $r3, 4, %stack.0.s :: (store 
(s32) into %ir.s + 4, basealign 8) + ; 32BIT-NEXT: STW killed renamable $r3, 0, %stack.0.s :: (store (s32) into %ir.s, align 8) + ; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: renamable $r5 = LWZ 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8) + ; 32BIT-NEXT: renamable $r4 = LWZ 4, %stack.0.s :: (load (s32) from %stack.0.s + 4) + ; 32BIT-NEXT: $r3 = LI 0 + ; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r2, implicit-def $r1, implicit-def $r3 + ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 + ; + ; 64BIT-LABEL: name: call_test_byval_homogeneous_float_struct + ; 64BIT: bb.0.entry: + ; 64BIT-NEXT: renamable $x3 = LI8 0 + ; 64BIT-NEXT: STW8 renamable $x3, 8, %stack.0.s :: (store (s32) into %ir.s + 8, align 8) + ; 64BIT-NEXT: STD killed renamable $x3, 0, %stack.0.s :: (store (s64) into %ir.s) + ; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: renamable $x3 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8) + ; 64BIT-NEXT: renamable $x4 = RLDICR killed renamable $x3, 32, 31 + ; 64BIT-NEXT: $x3 = LI8 0 + ; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3 + ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 +entry: + %s = alloca %struct.F, align 8 + call void @llvm.memset.p0.i32(ptr align 4 %s, i8 0, i32 12, i1 false) + %call = call i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4 %s) + ret i32 %call +} + +declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) + +declare i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4) diff --git 
a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll index 5e7a1bc81916e..a06b61fc45334 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll @@ -1,18 +1,11 @@ -; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \ -; RUN: -mattr=-altivec -verify-machineinstrs < %s | \ -; RUN: FileCheck --check-prefixes=CHECK,32BIT %s - +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \ ; RUN: -mtriple powerpc-ibm-aix-xcoff < %s | \ -; RUN: FileCheck --check-prefixes=CHECKASM,ASM32 %s - -; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \ -; RUN: -mattr=-altivec -verify-machineinstrs < %s | \ -; RUN: FileCheck --check-prefixes=CHECK,64BIT %s +; RUN: FileCheck --check-prefixes=32BIT %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \ ; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s | \ -; RUN: FileCheck --check-prefixes=CHECKASM,ASM64 %s +; RUN: FileCheck --check-prefixes=64BIT %s %struct.S0 = type {} @@ -20,96 +13,60 @@ @gS1 = external global %struct.S1, align 1 define void @call_test_byval_1Byte() { +; 32BIT-LABEL: call_test_byval_1Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -64(1) +; 32BIT-NEXT: lwz 3, L..C0(2) # @gS1 +; 32BIT-NEXT: stw 0, 72(1) +; 32BIT-NEXT: lbz 3, 0(3) +; 32BIT-NEXT: slwi 3, 3, 24 +; 32BIT-NEXT: bl .test_byval_1Byte +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 64 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_1Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -128(1) +; 64BIT-NEXT: ld 3, L..C0(2) # @gS1 +; 64BIT-NEXT: std 0, 144(1) +; 64BIT-NEXT: lbz 3, 0(3) +; 64BIT-NEXT: sldi 3, 3, 56 +; 64BIT-NEXT: bl .test_byval_1Byte +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 128 +; 64BIT-NEXT: ld 0, 
16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %s0 = alloca %struct.S0, align 8 %call = call zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 @gS1) ret void } - -; CHECK-LABEL: name: call_test_byval_1Byte{{.*}} - -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-NEXT: renamable $r[[REG:[0-9]+]] = LWZtoc @gS1, $r2 :: (load (s32) from got) -; 32BIT-NEXT: renamable $r3 = LBZ 0, killed renamable $r[[REG]] :: (load (s8)) -; 32BIT-NEXT: renamable $r3 = RLWINM killed renamable $r3, 24, 0, 7 -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1 -; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; CHECKASM-LABEL: .call_test_byval_1Byte: - -; ASM32: stwu 1, -64(1) -; ASM32-NEXT: lwz [[REG:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM32-NEXT: stw 0, 72(1) -; ASM32-NEXT: lbz 3, 0([[REG]]) -; ASM32-NEXT: slwi 3, 3, 24 -; ASM32-NEXT: bl .test_byval_1Byte -; ASM32-NEXT: nop -; ASM32-NEXT: addi 1, 1, 64 - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REG:[0-9]+]] = LDtoc @gS1, $x2 :: (load (s64) from got) -; 64BIT-NEXT: renamable $x3 = LBZ8 0, killed renamable $x[[REG]] :: (load (s8)) -; 64BIT-NEXT: renamable $x3 = RLDICR killed renamable $x3, 56, 7 -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - -; ASM64: stdu 1, -128(1) -; ASM64-NEXT: ld [[REG:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-NEXT: std 0, 144(1) -; ASM64-NEXT: lbz 3, 0([[REG]]) -; ASM64-NEXT: sldi 3, 3, 56 -; ASM64-NEXT: bl .test_byval_1Byte -; ASM64-NEXT: nop -; ASM64-NEXT: addi 1, 1, 128 - - define zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 %s) { +; 32BIT-LABEL: test_byval_1Byte: +; 32BIT: # %bb.0: # %entry +; 
32BIT-NEXT: mr 4, 3 +; 32BIT-NEXT: srwi 3, 3, 24 +; 32BIT-NEXT: stw 4, 24(1) +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: test_byval_1Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mr 4, 3 +; 64BIT-NEXT: rldicl 3, 3, 8, 56 +; 64BIT-NEXT: std 4, 48(1) +; 64BIT-NEXT: blr entry: %0 = load i8, ptr %s, align 1 ret i8 %0 } -; CHECK-LABEL: name: test_byval_1Byte - -; 32BIT: fixedStack: -; 32BIT-NEXT: - { id: 0, type: default, offset: 24, size: 4, alignment: 8, stack-id: default, -; 32BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, -; 32BIT: - { id: 1, type: default, offset: 24, size: 4, alignment: 8, stack-id: default, -; 32BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, - -; 32BIT: bb.0.entry: -; 32BIT-NEXT: liveins: $r3 -; 32BIT: renamable $r4 = COPY $r3 -; 32BIT: renamable $r3 = RLWINM $r3, 8, 24, 31 -; 32BIT: STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8) -; 32BIT-NEXT: BLR - -; 64BIT: fixedStack: -; 64BIT-NEXT: - { id: 0, type: default, offset: 48, size: 8, alignment: 16, stack-id: default, -; 64BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, -; 64BIT: - { id: 1, type: default, offset: 48, size: 8, alignment: 16, stack-id: default, -; 64BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, - -; 64BIT: bb.0.entry: -; 64BIT-NEXT: liveins: $x3 -; 64BIT: renamable $x4 = COPY $x3 -; 64BIT: renamable $x3 = RLDICL $x3, 8, 56 -; 64BIT: STD killed renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16) - -; CHECKASM-LABEL: .test_byval_1Byte: - -; ASM32: mr 4, 3 -; ASM32-NEXT: srwi 3, 3, 24 -; ASM32-NEXT: stw 4, 24(1) -; ASM32-NEXT: blr - -; ASM64: mr 4, 3 -; ASM64-NEXT: rldicl 3, 3, 8, 56 -; ASM64-NEXT: std 4, 48(1) -; ASM64-NEXT: blr - - @f = common global float 0.000000e+00, align 4 %struct.S2 = type { 
[2 x i8] } @@ -117,240 +74,184 @@ entry: @gS2 = external global %struct.S2, align 1 define void @call_test_byval_2Byte() { +; 32BIT-LABEL: call_test_byval_2Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -64(1) +; 32BIT-NEXT: lwz 3, L..C1(2) # @f +; 32BIT-NEXT: stw 0, 72(1) +; 32BIT-NEXT: li 7, 43 +; 32BIT-NEXT: lfs 1, 0(3) +; 32BIT-NEXT: lwz 3, L..C2(2) # @gS2 +; 32BIT-NEXT: lhz 3, 0(3) +; 32BIT-NEXT: fmr 2, 1 +; 32BIT-NEXT: slwi 5, 3, 16 +; 32BIT-NEXT: li 3, 42 +; 32BIT-NEXT: bl .test_byval_2Byte +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 64 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_2Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -112(1) +; 64BIT-NEXT: ld 3, L..C1(2) # @f +; 64BIT-NEXT: std 0, 128(1) +; 64BIT-NEXT: li 7, 43 +; 64BIT-NEXT: lfs 1, 0(3) +; 64BIT-NEXT: ld 3, L..C2(2) # @gS2 +; 64BIT-NEXT: lhz 3, 0(3) +; 64BIT-NEXT: fmr 2, 1 +; 64BIT-NEXT: sldi 5, 3, 48 +; 64BIT-NEXT: li 3, 42 +; 64BIT-NEXT: bl .test_byval_2Byte +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 112 +; 64BIT-NEXT: ld 0, 16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %0 = load float, ptr @f, align 4 %call = call zeroext i8 @test_byval_2Byte(i32 signext 42, float %0, ptr byval(%struct.S2) align 1 @gS2, float %0, i32 signext 43) ret void } -; CHECK-LABEL: name: call_test_byval_2Byte{{.*}} - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 32BIT: renamable $r[[REG1:[0-9]+]] = LWZtoc @f, $r2 :: (load (s32) from got) -; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r[[REG1]] :: (dereferenceable load (s32) from @f) -; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-DAG: $r3 = LI 42 -; 32BIT-DAG: renamable $r[[REG2:[0-9]+]] = LWZtoc @gS2, $r2 :: (load (s32) from got) -; 32BIT-DAG: renamable $r[[REG3:[0-9]+]] = LHZ 0, killed renamable $r[[REG2]] :: (load (s16)) -; 32BIT-DAG: renamable $r5 = RLWINM killed renamable $r[[REG3]], 16, 0, 15 -; 32BIT-DAG: $f2 = COPY renamable $f1 -; 32BIT-DAG: $r7 = LI 43 -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r5, implicit killed $f2, implicit killed $r7, implicit $r2, implicit-def $r1 -; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; CHECKASM-LABEL: .call_test_byval_2Byte: - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM32: stwu 1, -64(1) -; ASM32-DAG: li 3, 42 -; ASM32-DAG: lwz [[REG1:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM32-DAG: lfs 1, 0([[REG1]]) -; ASM32-DAG: lwz [[REG2:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM32-DAG: lhz [[REG3:[0-9]+]], 0([[REG2]]) -; ASM32-DAG: slwi 5, [[REG3]], 16 -; ASM32-DAG: fmr 2, 1 -; ASM32-DAG: li 7, 43 -; ASM32-NEXT: bl .test_byval_2Byte -; ASM32-NEXT: nop -; ASM32-NEXT: addi 1, 1, 64 - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 64BIT: renamable $x[[REG1:[0-9]+]] = LDtoc @f, $x2 :: (load (s64) from got) -; 64BIT-NEXT: renamable $f1 = LFS 0, killed renamable $x[[REG1]] :: (dereferenceable load (s32) from @f) -; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-DAG: $x3 = LI8 42 -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LDtoc @gS2, $x2 :: (load (s64) from got) -; 64BIT-DAG: renamable $x[[REG3:[0-9]+]] = LHZ8 0, killed renamable $x[[REG2]] :: (load (s16)) -; 64BIT-DAG: renamable $x5 = RLDICR killed renamable $x[[REG3]], 48, 15 -; 64BIT-DAG: $f2 = COPY renamable $f1 -; 64BIT-DAG: $x7 = LI8 43 -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x5, implicit killed $f2, implicit killed $x7, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM64: stdu 1, -112(1) -; ASM64-DAG: std 0, 128(1) -; ASM64-DAG: li 3, 42 -; ASM64-DAG: ld [[REG1:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-DAG: lfs 1, 0([[REG1]]) -; ASM64-DAG: ld [[REG2:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-DAG: lhz [[REG3:[0-9]+]], 0([[REG2]]) -; ASM64-DAG: sldi 5, [[REG3]], 48 -; ASM64-DAG: fmr 2, 1 -; ASM64-DAG: li 7, 43 -; ASM64-NEXT: bl .test_byval_2Byte -; ASM64-NEXT: nop -; ASM64-NEXT: addi 1, 1, 112 - define zeroext i8 @test_byval_2Byte(i32, float, ptr byval(%struct.S2) align 1 %s, float, i32) { +; 32BIT-LABEL: test_byval_2Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: stw 5, 32(1) +; 32BIT-NEXT: lbz 3, 33(1) +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: test_byval_2Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: std 5, 64(1) +; 64BIT-NEXT: lbz 3, 65(1) +; 64BIT-NEXT: blr entry: %arrayidx = getelementptr inbounds %struct.S2, ptr %s, i32 0, i32 0, i32 1 %4 = load i8, ptr %arrayidx, align 1 ret i8 %4 } -; CHECK-LABEL: name: test_byval_2Byte -; 32BIT: fixedStack: -; 32BIT-NEXT: - { id: 0, type: 
default, offset: 32, size: 4, alignment: 16, stack-id: default, - -; 32BIT: bb.0.entry: -; 32BIT-NEXT: liveins: $r5 -; 32BIT: STW killed renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16) -; 32BIT-NEXT: renamable $r3 = LBZ 1, %fixed-stack.0 :: (dereferenceable load (s8) - -; 64BIT: fixedStack: -; 64BIT-NEXT: - { id: 0, type: default, offset: 64, size: 8, alignment: 16, stack-id: default, - -; 64BIT: bb.0.entry: -; 64BIT-NEXT: liveins: $x5 -; 64BIT: STD killed renamable $x5, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16) -; 64BIT-NEXT: renamable $x3 = LBZ8 1, %fixed-stack.0 :: (dereferenceable load (s8) - -; CHECKASM-LABEL: .test_byval_2Byte: - -; ASM32: stw 5, 32(1) -; ASM32-NEXT: lbz 3, 33(1) -; ASM32-NEXT: blr - -; ASM64: std 5, 64(1) -; ASM64-NEXT: lbz 3, 65(1) -; ASM64-NEXT: blr - - %struct.S3 = type <{ i8, i16 }> @gS3 = external global %struct.S3, align 1 define void @call_test_byval_3Byte() { +; 32BIT-LABEL: call_test_byval_3Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -64(1) +; 32BIT-NEXT: lwz 4, L..C3(2) # @gS3 +; 32BIT-NEXT: li 3, 42 +; 32BIT-NEXT: stw 0, 72(1) +; 32BIT-NEXT: li 5, 3 +; 32BIT-NEXT: li 6, 4 +; 32BIT-NEXT: li 7, 5 +; 32BIT-NEXT: stw 3, 56(1) +; 32BIT-NEXT: li 8, 6 +; 32BIT-NEXT: li 9, 7 +; 32BIT-NEXT: lbz 3, 2(4) +; 32BIT-NEXT: lhz 4, 0(4) +; 32BIT-NEXT: rlwinm 10, 3, 8, 16, 23 +; 32BIT-NEXT: li 3, 1 +; 32BIT-NEXT: rlwimi 10, 4, 16, 0, 15 +; 32BIT-NEXT: li 4, 2 +; 32BIT-NEXT: bl .test_byval_3Byte +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 64 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_3Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -128(1) +; 64BIT-NEXT: ld 4, L..C3(2) # @gS3 +; 64BIT-NEXT: li 3, 42 +; 64BIT-NEXT: std 0, 144(1) +; 64BIT-NEXT: li 5, 3 +; 64BIT-NEXT: li 6, 4 +; 64BIT-NEXT: li 7, 5 +; 64BIT-NEXT: std 3, 112(1) +; 64BIT-NEXT: li 8, 6 +; 64BIT-NEXT: li 9, 7 
+; 64BIT-NEXT: lbz 3, 2(4) +; 64BIT-NEXT: lhz 4, 0(4) +; 64BIT-NEXT: rldic 10, 3, 40, 16 +; 64BIT-NEXT: li 3, 1 +; 64BIT-NEXT: rldimi 10, 4, 48, 0 +; 64BIT-NEXT: li 4, 2 +; 64BIT-NEXT: bl .test_byval_3Byte +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 128 +; 64BIT-NEXT: ld 0, 16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %call = call zeroext i16 @test_byval_3Byte(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, ptr byval(%struct.S3) align 1 @gS3, i32 42) ret void } -; CHECK-LABEL: name: call_test_byval_3Byte{{.*}} - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 32BIT: ADJCALLSTACKDOWN 60, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-DAG: $r3 = LI 1 -; 32BIT-DAG: $r4 = LI 2 -; 32BIT-DAG: $r5 = LI 3 -; 32BIT-DAG: $r6 = LI 4 -; 32BIT-DAG: $r7 = LI 5 -; 32BIT-DAG: $r8 = LI 6 -; 32BIT-DAG: $r9 = LI 7 -; 32BIT-DAG: renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS3, $r2 :: (load (s32) from got) -; 32BIT-DAG: renamable $r[[REG1:[0-9]+]] = LHZ 0, killed renamable $r[[REGADDR]] :: (load (s16)) -; 32BIT-DAG: renamable $r[[REG2:[0-9]+]] = LBZ 2, renamable $r[[REGADDR]] :: (load (s8)) -; 32BIT-DAG: renamable $r10 = RLWINM killed renamable $r[[REG2]], 8, 16, 23 -; 32BIT-DAG: renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r[[REG1]], 16, 0, 15 -; 32BIT-DAG: renamable $r[[REG3:[0-9]+]] = LI 42 -; 32BIT-DAG: STW killed renamable $r[[REG3]], 56, $r1 :: (store (s32)) -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $r10, implicit $r2, implicit-def $r1 -; 32BIT-NEXT: ADJCALLSTACKUP 60, 0, implicit-def dead $r1, implicit $r1 - -; CHECKASM-LABEL: .call_test_byval_3Byte: - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; ASM32: stwu 1, -64(1) -; ASM32-DAG: li 3, 1 -; ASM32-DAG: li 4, 2 -; ASM32-DAG: li 5, 3 -; ASM32-DAG: li 6, 4 -; ASM32-DAG: li 7, 5 -; ASM32-DAG: li 8, 6 -; ASM32-DAG: li 9, 7 -; ASM32-DAG: lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM32-DAG: lhz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM32-DAG: lbz [[REG2:[0-9]+]], 2([[REGADDR]]) -; ASM32-DAG: rlwinm 10, [[REG2]], 8, 16, 23 -; ASM32-DAG: rlwimi 10, [[REG1]], 16, 0, 15 -; ASM32-DAG: li [[REG3:[0-9]+]], 42 -; ASM32-DAG: stw [[REG3]], 56(1) -; ASM32-NEXT: bl .test_byval_3Byte -; ASM32-NEXT: nop - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 64BIT: ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-DAG: $x3 = LI8 1 -; 64BIT-DAG: $x4 = LI8 2 -; 64BIT-DAG: $x5 = LI8 3 -; 64BIT-DAG: $x6 = LI8 4 -; 64BIT-DAG: $x7 = LI8 5 -; 64BIT-DAG: $x8 = LI8 6 -; 64BIT-DAG: $x9 = LI8 7 -; 64BIT-DAG: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS3, $x2 :: (load (s64) from got) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LHZ8 0, killed renamable $x[[REGADDR]] :: (load (s16)) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LBZ8 2, renamable $x[[REGADDR]] :: (load (s8)) -; 64BIT-DAG: renamable $x10 = RLDIC killed renamable $x[[REG2]], 40, 16 -; 64BIT-DAG: renamable $x10 = RLDIMI killed renamable $x10, killed renamable $x[[REG1]], 48, 0 -; 64BIT-DAG: $x[[REG3:[0-9]+]] = LI8 42 -; 64BIT-DAG: STD killed renamable $x[[REG3]], 112, $x1 :: (store (s64)) -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit $x10, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1 - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; ASM64: stdu 1, -128(1) -; ASM64-DAG: li 3, 1 -; ASM64-DAG: li 4, 2 -; ASM64-DAG: li 5, 3 -; ASM64-DAG: li 6, 4 -; ASM64-DAG: li 7, 5 -; ASM64-DAG: li 8, 6 -; ASM64-DAG: li 9, 7 -; ASM64-DAG: ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-DAG: lhz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64-DAG: lbz [[REG2:[0-9]+]], 2([[REGADDR]]) -; ASM64-DAG: rldic 10, [[REG2]], 40, 16 -; ASM64-DAG: rldimi 10, [[REG1]], 48, 0 -; ASM64-DAG: li [[REG3:[0-9]+]], 42 -; ASM64-DAG: std [[REG3]], 112(1) -; ASM64-NEXT: bl .test_byval_3Byte -; ASM64-NEXT: nop - - define zeroext i16 @test_byval_3Byte(i32, i32, i32, i32, i32, i32, i32, ptr byval(%struct.S3) align 1 %s, i32) { +; 32BIT-LABEL: test_byval_3Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: stw 10, 52(1) +; 32BIT-NEXT: lhz 3, 53(1) +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: test_byval_3Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: std 10, 104(1) +; 64BIT-NEXT: lhz 3, 105(1) +; 64BIT-NEXT: blr entry: %gep = getelementptr inbounds %struct.S3, ptr %s, i32 0, i32 1 %8 = load i16, ptr %gep, align 1 ret i16 %8 } -; CHECK-LABEL: name: test_byval_3Byte - -; 32BIT: fixedStack: -; 32BIT-NEXT: - { id: 0, type: default, offset: 56, size: 4, alignment: 8, stack-id: default, -; 32BIT: - { id: 1, type: default, offset: 52, size: 4, alignment: 4, stack-id: default, - -; 32BIT-LABEL: bb.0.entry: -; 32BIT-NEXT: liveins: $r10 -; 32BIT: STW killed renamable $r10, 0, %fixed-stack.1 :: (store (s32) into %fixed-stack.1) -; 32BIT-NEXT: renamable $r3 = LHZ 1, %fixed-stack.1 :: (dereferenceable load (s16) - -; 64BIT: fixedStack: -; 64BIT-NEXT: - { id: 0, type: default, offset: 116, size: 4, alignment: 4, stack-id: default, -; 64BIT: - { id: 1, type: default, offset: 104, size: 8, alignment: 8, stack-id: default, - -; 64BIT-LABEL: bb.0.entry: -; 64BIT-NEXT: liveins: $x10 -; 64BIT: STD killed renamable $x10, 0, %fixed-stack.1 :: (store (s64) into %fixed-stack.1) -; 64BIT-NEXT: renamable $x3 = LHZ8 1, %fixed-stack.1 :: (dereferenceable load (s16) - -; 
CHECKASM-LABEL: .test_byval_3Byte: - -; ASM32: stw 10, 52(1) -; ASM32-NEXT: lhz 3, 53(1) -; ASM32-NEXT: blr - -; ASM64: std 10, 104(1) -; ASM64-NEXT: lhz 3, 105(1) -; ASM64-NEXT: blr - - %struct.S4 = type { [4 x i8] } %struct.S4A = type { i32 } @gS4 = external global %struct.S4, align 1 define void @call_test_byval_4Byte() { +; 32BIT-LABEL: call_test_byval_4Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -80(1) +; 32BIT-NEXT: lwz 3, L..C4(2) # @gS4 +; 32BIT-NEXT: stw 0, 88(1) +; 32BIT-NEXT: lwz 3, 0(3) +; 32BIT-NEXT: lwz 4, 64(1) +; 32BIT-NEXT: bl .test_byval_4Byte +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 80 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_4Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -128(1) +; 64BIT-NEXT: ld 3, L..C4(2) # @gS4 +; 64BIT-NEXT: std 0, 144(1) +; 64BIT-NEXT: lwz 3, 0(3) +; 64BIT-NEXT: lwz 4, 112(1) +; 64BIT-NEXT: sldi 3, 3, 32 +; 64BIT-NEXT: sldi 4, 4, 32 +; 64BIT-NEXT: bl .test_byval_4Byte +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 128 +; 64BIT-NEXT: ld 0, 16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %s0 = alloca %struct.S0, align 8 %s4a = alloca %struct.S4A, align 8 @@ -358,46 +259,24 @@ entry: ret void } -; CHECK-LABEL: name: call_test_byval_4Byte{{.*}} - -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-NEXT: renamable $r[[REG:[0-9]+]] = LWZtoc @gS4, $r2 :: (load (s32) from got) -; 32BIT-DAG: renamable $r3 = LWZ 0, killed renamable $r[[REG]] :: (load (s32)) -; 32BIT-DAG: renamable $r4 = LWZ 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8) -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1 -; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; CHECKASM-LABEL: .call_test_byval_4Byte: - -; ASM32: stwu 1, -80(1) -; ASM32-NEXT: lwz [[REG:[0-9]+]], L..C{{[0-9]+}}(2) 
-; ASM32-DAG: lwz 3, 0([[REG]]) -; ASM32-DAG: lwz 4, 64(1) -; ASM32-NEXT: bl .test_byval_4Byte -; ASM32-NEXT: nop -; ASM32-NEXT: addi 1, 1, 80 - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS4, $x2 :: (load (s64) from got) -; 64BIT-DAG: renamable $x[[LD1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32)) -; 64BIT-DAG: renamable $x[[LD2:[0-9]+]] = LWZ8 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8) -; 64BIT-DAG: renamable $x3 = RLDICR killed renamable $x[[LD1]], 32, 31 -; 64BIT-DAG: renamable $x4 = RLDICR killed renamable $x[[LD2]], 32, 31 -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - -; ASM64: stdu 1, -128(1) -; ASM64-NEXT: ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-DAG: lwz [[LD1:[0-9]+]], 0([[REGADDR]]) -; ASM64-DAG: lwz [[LD2:[0-9]+]], 112(1) -; ASM64-DAG: sldi 3, [[LD1]], 32 -; ASM64-DAG: sldi 4, [[LD2]], 32 -; ASM64-NEXT: bl .test_byval_4Byte -; ASM64-NEXT: nop -; ASM64-NEXT: addi 1, 1, 128 - - define signext i32 @test_byval_4Byte(ptr byval(%struct.S4) align 1 %s, ptr byval(%struct.S0) align 1, ptr byval(%struct.S4A) align 4 %s4a) { +; 32BIT-LABEL: test_byval_4Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: stw 3, 24(1) +; 32BIT-NEXT: clrlwi 3, 3, 24 +; 32BIT-NEXT: add 3, 4, 3 +; 32BIT-NEXT: stw 4, 28(1) +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: test_byval_4Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: std 3, 48(1) +; 64BIT-NEXT: lbz 3, 51(1) +; 64BIT-NEXT: std 4, 56(1) +; 64BIT-NEXT: rldicl 4, 4, 32, 32 +; 64BIT-NEXT: add 3, 4, 3 +; 64BIT-NEXT: extsw 3, 3 +; 64BIT-NEXT: blr entry: %arrayidx = getelementptr inbounds %struct.S4, ptr %s, i32 0, i32 0, i32 3 %1 = load i8, ptr %arrayidx, align 1 @@ -407,64 +286,43 @@ entry: ret i32 %add } -; CHECK-LABEL: name: test_byval_4Byte - -; 32BIT: 
fixedStack: -; 32BIT-NEXT: - { id: 0, type: default, offset: 28, size: 4, alignment: 4, stack-id: default, -; 32BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, -; 32BIT: - { id: 1, type: default, offset: 28, size: 4, alignment: 4, stack-id: default, -; 32BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, -; 32BIT: - { id: 2, type: default, offset: 24, size: 4, alignment: 8, stack-id: default, -; 32BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, - -; 32BIT: bb.0.entry: -; 32BIT-NEXT: liveins: $r3 -; 32BIT: STW renamable $r3, 0, %fixed-stack.2 :: (store (s32) into %fixed-stack.2, align 8) -; 32BIT-DAG: STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0) -; 32BIT-DAG: renamable $r[[SCRATCH:[0-9]+]] = RLWINM killed renamable $r3, 0, 24, 31 -; 32BIT-DAG: renamable $r3 = nsw ADD4 renamable $r4, killed renamable $r[[SCRATCH]] -; 32BIT: BLR - -; 64BIT: fixedStack: -; 64BIT-NEXT: - { id: 0, type: default, offset: 56, size: 8, alignment: 8, stack-id: default, -; 64BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, -; 64BIT: - { id: 1, type: default, offset: 56, size: 8, alignment: 8, stack-id: default, -; 64BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, -; 64BIT: - { id: 2, type: default, offset: 48, size: 8, alignment: 16, stack-id: default, -; 64BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, - -; 64BIT: bb.0.entry: -; 64BIT-NEXT: liveins: $x3 -; 64BIT: STD killed renamable $x3, 0, %fixed-stack.2 :: (store (s64) into %fixed-stack.2, align 16) -; 64BIT: STD renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0) -; 64BIT-DAG: renamable $r[[SCRATCH1:[0-9]+]] = LBZ 3, %fixed-stack.2 :: (dereferenceable load (s8) -; 
64BIT-DAG: renamable $x[[SCRATCH2:[0-9]+]] = RLDICL killed renamable $x4, 32, 32 -; 64BIT-NEXT: renamable $r[[SCRATCH3:[0-9]+]] = nsw ADD4 renamable $r[[SCRATCH2]], killed renamable $r[[SCRATCH1]], implicit killed $x[[SCRATCH2]] -; 64BIT-NEXT: renamable $x3 = EXTSW_32_64 killed renamable $r[[SCRATCH3]] -; 64BIT-NEXT: BLR8 - -; CHECKASM-LABEL: .test_byval_4Byte: - -; ASM32: stw 3, 24(1) -; ASM32-DAG: stw 4, 28(1) -; ASM32-DAG: clrlwi [[SCRATCH:[0-9]+]], 3, 24 -; ASM32-DAG: add 3, 4, [[SCRATCH]] -; ASM32-NEXT: blr - -; ASM64: std 3, 48(1) -; ASM64-NEXT: lbz [[SCRATCH1:[0-9]+]], 51(1) -; ASM64-NEXT: std 4, 56(1) -; ASM64-NEXT: rldicl [[SCRATCH2:[0-9]+]], 4, 32, 32 -; ASM64-NEXT: add [[SCRATCH3:[0-9]+]], [[SCRATCH2]], [[SCRATCH1]] -; ASM64-NEXT: extsw 3, [[SCRATCH3]] -; ASM64-NEXT: blr - - %struct.S5 = type { [5 x i8] } @gS5 = external global %struct.S5, align 1 define void @call_test_byval_5Byte() { +; 32BIT-LABEL: call_test_byval_5Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -64(1) +; 32BIT-NEXT: lwz 3, L..C5(2) # @gS5 +; 32BIT-NEXT: stw 0, 72(1) +; 32BIT-NEXT: lbz 4, 4(3) +; 32BIT-NEXT: lwz 3, 0(3) +; 32BIT-NEXT: slwi 4, 4, 24 +; 32BIT-NEXT: bl .test_byval_5Byte[PR] +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 64 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_5Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -112(1) +; 64BIT-NEXT: ld 3, L..C5(2) # @gS5 +; 64BIT-NEXT: std 0, 128(1) +; 64BIT-NEXT: lbz 4, 4(3) +; 64BIT-NEXT: lwz 5, 0(3) +; 64BIT-NEXT: rlwinm 3, 4, 24, 0, 7 +; 64BIT-NEXT: rldimi 3, 5, 32, 0 +; 64BIT-NEXT: bl .test_byval_5Byte[PR] +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 112 +; 64BIT-NEXT: ld 0, 16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %call = call zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1 @gS5) ret void @@ -472,54 +330,43 @@ entry: declare zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1) -; 
CHECK-LABEL: name: call_test_byval_5Byte{{.*}} - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-NEXT: renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS5, $r2 :: (load (s32) from got) -; 32BIT-DAG: renamable $r[[REG1:[0-9]+]] = LBZ 4, renamable $r[[REGADDR]] :: (load (s8)) -; 32BIT-DAG: renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r4 = RLWINM killed renamable $r[[REG1]], 24, 0, 7 -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1 -; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; CHECKASM-LABEL: .call_test_byval_5Byte: - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM32: stwu 1, -64(1) -; ASM32-NEXT: lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM32-DAG: lbz [[REG1:[0-9]+]], 4([[REGADDR]]) -; ASM32-DAG: lwz 3, 0([[REGADDR]]) -; ASM32-DAG: slwi 4, [[REG1]], 24 -; ASM32-NEXT: bl .test_byval_5Byte[PR] -; ASM32-NEXT: nop - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS5, $x2 :: (load (s64) from got) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32)) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LBZ8 4, renamable $x[[REGADDR]] :: (load (s8)) -; 64BIT-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 24, 0, 7 -; 64BIT-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM64: stdu 1, -112(1) -; ASM64-NEXT: ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64-DAG: lbz [[REG2:[0-9]+]], 4([[REGADDR]]) -; ASM64-DAG: rlwinm 3, [[REG2]], 24, 0, 7 -; ASM64-DAG: rldimi 3, [[REG1]], 32, 0 -; ASM64-NEXT: bl .test_byval_5Byte[PR] -; ASM64-NEXT: nop - - %struct.S6 = type { [6 x i8] } @gS6 = external global %struct.S6, align 1 define void @call_test_byval_6Byte() { +; 32BIT-LABEL: call_test_byval_6Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -64(1) +; 32BIT-NEXT: lwz 3, L..C6(2) # @gS6 +; 32BIT-NEXT: stw 0, 72(1) +; 32BIT-NEXT: lhz 4, 4(3) +; 32BIT-NEXT: lwz 3, 0(3) +; 32BIT-NEXT: slwi 4, 4, 16 +; 32BIT-NEXT: bl .test_byval_6Byte[PR] +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 64 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_6Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -112(1) +; 64BIT-NEXT: ld 3, L..C6(2) # @gS6 +; 64BIT-NEXT: std 0, 128(1) +; 64BIT-NEXT: lhz 4, 4(3) +; 64BIT-NEXT: lwz 5, 0(3) +; 64BIT-NEXT: rlwinm 3, 4, 16, 0, 15 +; 64BIT-NEXT: rldimi 3, 5, 32, 0 +; 
64BIT-NEXT: bl .test_byval_6Byte[PR] +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 112 +; 64BIT-NEXT: ld 0, 16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %call = call zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1 @gS6) ret void @@ -527,54 +374,47 @@ entry: declare zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1) -; CHECK-LABEL: name: call_test_byval_6Byte{{.*}} - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-NEXT: renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS6, $r2 :: (load (s32) from got) -; 32BIT-DAG: renamable $r[[REG1:[0-9]+]] = LHZ 4, renamable $r[[REGADDR]] :: (load (s16)) -; 32BIT-DAG: renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r4 = RLWINM killed renamable $r[[REG1]], 16, 0, 15 -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1 -; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; CHECKASM-LABEL: .call_test_byval_6Byte: - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM32: stwu 1, -64(1) -; ASM32-NEXT: lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM32-DAG: lhz [[REG1:[0-9]+]], 4([[REGADDR]]) -; ASM32-DAG: lwz 3, 0([[REGADDR]]) -; ASM32-DAG: slwi 4, [[REG1]], 16 -; ASM32-NEXT: bl .test_byval_6Byte[PR] -; ASM32-NEXT: nop - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS6, $x2 :: (load (s64) from got) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32)) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load (s16)) -; 64BIT-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 16, 0, 15 -; 64BIT-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM64: stdu 1, -112(1) -; ASM64-NEXT: ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64-DAG: lhz [[REG2:[0-9]+]], 4([[REGADDR]]) -; ASM64-DAG: rlwinm 3, [[REG2]], 16, 0, 15 -; ASM64-DAG: rldimi 3, [[REG1]], 32, 0 -; ASM64-NEXT: bl .test_byval_6Byte[PR] -; ASM64-NEXT: nop - - %struct.S7 = type { [7 x i8] } @gS7 = external global %struct.S7, align 1 define void @call_test_byval_7Byte() { +; 32BIT-LABEL: call_test_byval_7Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -64(1) +; 32BIT-NEXT: lwz 3, L..C7(2) # @gS7 +; 32BIT-NEXT: stw 0, 72(1) +; 32BIT-NEXT: lbz 4, 6(3) +; 32BIT-NEXT: lhz 5, 4(3) +; 32BIT-NEXT: rlwinm 4, 4, 8, 16, 23 +; 32BIT-NEXT: lwz 3, 0(3) +; 32BIT-NEXT: rlwimi 4, 5, 16, 0, 15 +; 32BIT-NEXT: bl .test_byval_7Byte[PR] +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 64 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_7Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -112(1) +; 64BIT-NEXT: ld 3, L..C7(2) # @gS7 +; 64BIT-NEXT: std 0, 128(1) +; 64BIT-NEXT: lbz 5, 6(3) +; 64BIT-NEXT: lhz 4, 4(3) +; 
64BIT-NEXT: lwz 6, 0(3) +; 64BIT-NEXT: rlwinm 3, 5, 8, 16, 23 +; 64BIT-NEXT: rlwimi 3, 4, 16, 0, 15 +; 64BIT-NEXT: rldimi 3, 6, 32, 0 +; 64BIT-NEXT: bl .test_byval_7Byte[PR] +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 112 +; 64BIT-NEXT: ld 0, 16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %call = call zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1 @gS7) ret void @@ -582,62 +422,39 @@ entry: declare zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1) -; CHECK-LABEL: name: call_test_byval_7Byte{{.*}} - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-NEXT: renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS7, $r2 :: (load (s32) from got) -; 32BIT-DAG: renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r[[REG1:[0-9]+]] = LHZ 4, renamable $r[[REGADDR]] :: (load (s16)) -; 32BIT-DAG: renamable $r[[REG2:[0-9]+]] = LBZ 6, renamable $r[[REGADDR]] :: (load (s8)) -; 32BIT-DAG: renamable $r4 = RLWINM killed renamable $r[[REG2]], 8, 16, 23 -; 32BIT-DAG: renamable $r4 = RLWIMI killed renamable $r4, killed renamable $r[[REG1]], 16, 0, 15 -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1 -; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; CHECKASM-LABEL: .call_test_byval_7Byte: - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; ASM32: stwu 1, -64(1) -; ASM32-NEXT: lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM32-DAG: lwz 3, 0([[REGADDR]]) -; ASM32-DAG: lhz [[REG1:[0-9]+]], 4([[REGADDR]]) -; ASM32-DAG: lbz [[REG2:[0-9]+]], 6([[REGADDR]]) -; ASM32-DAG: rlwinm 4, [[REG2]], 8, 16, 23 -; ASM32-DAG: rlwimi 4, [[REG1]], 16, 0, 15 -; ASM32-NEXT: bl .test_byval_7Byte[PR] -; ASM32-NEXT: nop - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS7, $x2 :: (load (s64) from got) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32)) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load (s16)) -; 64BIT-DAG: renamable $x[[REG3:[0-9]+]] = LBZ8 6, renamable $x[[REGADDR]] :: (load (s8)) -; 64BIT-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG3]], 8, 16, 23 -; 64BIT-DAG: renamable $x3 = RLWIMI8 killed renamable $x3, killed renamable $x[[REG2]], 16, 0, 15 -; 64BIT-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; ASM64: stdu 1, -112(1) -; ASM64-NEXT: ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64-DAG: lhz [[REG2:[0-9]+]], 4([[REGADDR]]) -; ASM64-DAG: lbz [[REG3:[0-9]+]], 6([[REGADDR]]) -; ASM64-DAG: rlwinm 3, [[REG3]], 8, 16, 23 -; ASM64-DAG: rlwimi 3, [[REG2]], 16, 0, 15 -; ASM64-DAG: rldimi 3, [[REG1]], 32, 0 -; ASM64-NEXT: bl .test_byval_7Byte[PR] -; ASM64-NEXT: nop - - %struct.S8 = type { [8 x i8] } @gS8 = external global %struct.S8, align 1 define void @call_test_byval_8Byte() { +; 32BIT-LABEL: call_test_byval_8Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -64(1) +; 32BIT-NEXT: lwz 3, L..C8(2) # @gS8 +; 32BIT-NEXT: stw 0, 72(1) +; 32BIT-NEXT: lwz 4, 4(3) +; 32BIT-NEXT: lwz 3, 0(3) +; 32BIT-NEXT: bl .test_byval_8Byte[PR] +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 64 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_8Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -112(1) +; 64BIT-NEXT: ld 3, L..C8(2) # @gS8 +; 64BIT-NEXT: std 0, 128(1) +; 64BIT-NEXT: ld 3, 0(3) +; 64BIT-NEXT: bl .test_byval_8Byte[PR] +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 112 +; 64BIT-NEXT: ld 0, 16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %call = call zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1 @gS8) ret void @@ -645,102 +462,75 @@ entry: declare zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1) -; CHECK-LABEL: name: call_test_byval_8Byte{{.*}} - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-NEXT: renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS8, $r2 :: (load (s32) from got) -; 32BIT-DAG: renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r4 = LWZ 4, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1 -; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; CHECKASM-LABEL: .call_test_byval_8Byte: - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM32: stwu 1, -64(1) -; ASM32-NEXT: lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM32-DAG: lwz 3, 0([[REGADDR]]) -; ASM32-DAG: lwz 4, 4([[REGADDR]]) -; ASM32-NEXT: bl .test_byval_8Byte[PR] -; ASM32-NEXT: nop - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS8, $x2 :: (load (s64) from got) -; 64BIT-NEXT: renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load (s64)) -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - -; ASM64: stdu 1, -112(1) -; ASM64-NEXT: ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-NEXT: std 0, 128(1) -; ASM64-NEXT: ld 3, 0([[REGADDR]]) -; ASM64-NEXT: bl .test_byval_8Byte[PR] -; ASM64-NEXT: nop - - %struct.S32 = type { [32 x i8] } @gS32 = external global %struct.S32, align 1 define void @call_test_byval_32Byte() { +; 32BIT-LABEL: call_test_byval_32Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -64(1) +; 32BIT-NEXT: lwz 3, L..C9(2) # @gS32 +; 32BIT-NEXT: stw 0, 72(1) +; 32BIT-NEXT: lwz 10, 28(3) +; 32BIT-NEXT: lwz 9, 24(3) +; 32BIT-NEXT: lwz 8, 20(3) +; 32BIT-NEXT: lwz 7, 16(3) +; 32BIT-NEXT: lwz 6, 12(3) +; 32BIT-NEXT: lwz 5, 
8(3) +; 32BIT-NEXT: lwz 4, 4(3) +; 32BIT-NEXT: lwz 3, 0(3) +; 32BIT-NEXT: bl .test_byval_32Byte +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 64 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_32Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -112(1) +; 64BIT-NEXT: ld 3, L..C9(2) # @gS32 +; 64BIT-NEXT: std 0, 128(1) +; 64BIT-NEXT: ld 6, 24(3) +; 64BIT-NEXT: ld 5, 16(3) +; 64BIT-NEXT: ld 4, 8(3) +; 64BIT-NEXT: ld 3, 0(3) +; 64BIT-NEXT: bl .test_byval_32Byte +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 112 +; 64BIT-NEXT: ld 0, 16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %call = call zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 @gS32) ret void } -; CHECK-LABEL: name: call_test_byval_32Byte{{.*}} - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-NEXT: renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS32, $r2 :: (load (s32) from got) -; 32BIT-DAG: renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r4 = LWZ 4, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r5 = LWZ 8, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r6 = LWZ 12, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r7 = LWZ 16, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r8 = LWZ 20, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r9 = LWZ 24, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r10 = LWZ 28, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1 -; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 
CHECKASM-LABEL: .call_test_byval_32Byte: - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM32: stwu 1, -64(1) -; ASM32-NEXT: lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM32-DAG: lwz 3, 0([[REGADDR]]) -; ASM32-DAG: lwz 4, 4([[REGADDR]]) -; ASM32-DAG: lwz 5, 8([[REGADDR]]) -; ASM32-DAG: lwz 6, 12([[REGADDR]]) -; ASM32-DAG: lwz 7, 16([[REGADDR]]) -; ASM32-DAG: lwz 8, 20([[REGADDR]]) -; ASM32-DAG: lwz 9, 24([[REGADDR]]) -; ASM32-DAG: lwz 10, 28([[REGADDR]]) -; ASM32-NEXT: bl .test_byval_32Byte -; ASM32-NEXT: nop - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS32, $x2 :: (load (s64) from got) -; 64BIT-DAG: renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load (s64)) -; 64BIT-DAG: renamable $x4 = LD 8, renamable $x[[REGADDR]] :: (load (s64)) -; 64BIT-DAG: renamable $x5 = LD 16, renamable $x[[REGADDR]] :: (load (s64)) -; 64BIT-DAG: renamable $x6 = LD 24, renamable $x[[REGADDR]] :: (load (s64)) -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - -; ASM64: stdu 1, -112(1) -; ASM64-NEXT: ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-DAG: ld 3, 0([[REGADDR]]) -; ASM64-DAG: ld 4, 8([[REGADDR]]) -; ASM64-DAG: ld 5, 16([[REGADDR]]) -; ASM64-DAG: ld 6, 24([[REGADDR]]) -; ASM64-NEXT: bl .test_byval_32Byte -; ASM64-NEXT: nop - define zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 %s) { +; 32BIT-LABEL: test_byval_32Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: stw 8, 44(1) +; 32BIT-NEXT: stw 3, 24(1) +; 32BIT-NEXT: lbz 3, 45(1) +; 32BIT-NEXT: stw 4, 28(1) +; 32BIT-NEXT: stw 5, 32(1) +; 32BIT-NEXT: stw 6, 36(1) +; 32BIT-NEXT: stw 7, 40(1) +; 
32BIT-NEXT: stw 9, 48(1) +; 32BIT-NEXT: stw 10, 52(1) +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: test_byval_32Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: std 5, 64(1) +; 64BIT-NEXT: std 3, 48(1) +; 64BIT-NEXT: lbz 3, 69(1) +; 64BIT-NEXT: std 4, 56(1) +; 64BIT-NEXT: std 6, 72(1) +; 64BIT-NEXT: blr entry: %arrayidx = getelementptr inbounds %struct.S32, ptr %s, i32 0, i32 0, i32 21 %0 = load i8, ptr %arrayidx, align 1 @@ -750,200 +540,127 @@ entry: ; The ByVal handling produces dead stores. See `LowerFormalArguments_AIX` for ; details on why. -; CHECK-LABEL: name: test_byval_32Byte - -; 32BIT: fixedStack: -; 32BIT-NEXT: - { id: 0, type: default, offset: 24, size: 32, alignment: 8, stack-id: default, -; 32BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, - -; 32BIT: bb.0.entry: -; 32BIT-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 -; 32BIT: STW killed renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20 -; 32BIT-DAG: STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 -; 32BIT-DAG: STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4 -; 32BIT-DAG: STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8 -; 32BIT-DAG: STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12 -; 32BIT-DAG: STW killed renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16 -; 32BIT: renamable $r3 = LBZ 21, %fixed-stack.0 :: (dereferenceable load (s8) -; 32BIT-DAG: STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24 -; 32BIT-DAG: STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28 -; 32BIT: BLR - -; 64BIT: fixedStack: -; 64BIT-NEXT: - { id: 0, type: default, offset: 48, size: 32, alignment: 16, stack-id: default, -; 64BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', 
callee-saved-restored: true, - -; 64BIT: bb.0.entry: -; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6 -; 64BIT: STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16 -; 64BIT-DAG: STD killed renamable $x3, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 -; 64BIT-NEXT: renamable $x3 = LBZ8 21, %fixed-stack.0 :: (dereferenceable load (s8) -; 64BIT-DAG: STD killed renamable $x4, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8 -; 64BIT-DAG: STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24 -; 64BIT-NEXT: BLR8 - -; CHECKASM-LABEL: .test_byval_32Byte: - -; ASM32: stw 8, 44(1) -; ASM32: stw 3, 24(1) -; ASM32-DAG: lbz 3, 45(1) -; ASM32-DAG: stw 4, 28(1) -; ASM32-DAG: stw 5, 32(1) -; ASM32-DAG: stw 6, 36(1) -; ASM32-DAG: stw 7, 40(1) -; ASM32-DAG: stw 9, 48(1) -; ASM32-DAG: stw 10, 52(1) -; ASM32-NEXT: blr - -; ASM64: std 5, 64(1) -; ASM64: std 3, 48(1) -; ASM64-DAG: lbz 3, 69(1) -; ASM64-DAG: std 4, 56(1) -; ASM64-DAG: std 6, 72(1) -; ASM64-NEXT: blr - %struct.S31 = type <{ float, i32, i64, double, i32, i16, i8 }> @gS31 = external global %struct.S31, align 1 define void @call_test_byval_31Byte() { +; 32BIT-LABEL: call_test_byval_31Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -64(1) +; 32BIT-NEXT: lwz 3, L..C10(2) # @gS31 +; 32BIT-NEXT: stw 0, 72(1) +; 32BIT-NEXT: lbz 10, 30(3) +; 32BIT-NEXT: lhz 11, 28(3) +; 32BIT-NEXT: rlwinm 10, 10, 8, 16, 23 +; 32BIT-NEXT: lwz 9, 24(3) +; 32BIT-NEXT: rlwimi 10, 11, 16, 0, 15 +; 32BIT-NEXT: lwz 8, 20(3) +; 32BIT-NEXT: lwz 7, 16(3) +; 32BIT-NEXT: lwz 6, 12(3) +; 32BIT-NEXT: lwz 5, 8(3) +; 32BIT-NEXT: lwz 4, 4(3) +; 32BIT-NEXT: lwz 3, 0(3) +; 32BIT-NEXT: bl .test_byval_31Byte +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 64 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_31Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -112(1) +; 64BIT-NEXT: ld 
3, L..C10(2) # @gS31 +; 64BIT-NEXT: std 0, 128(1) +; 64BIT-NEXT: lbz 6, 30(3) +; 64BIT-NEXT: lhz 7, 28(3) +; 64BIT-NEXT: rlwinm 6, 6, 8, 16, 23 +; 64BIT-NEXT: lwz 8, 24(3) +; 64BIT-NEXT: rlwimi 6, 7, 16, 0, 15 +; 64BIT-NEXT: ld 5, 16(3) +; 64BIT-NEXT: rldimi 6, 8, 32, 0 +; 64BIT-NEXT: ld 4, 8(3) +; 64BIT-NEXT: ld 3, 0(3) +; 64BIT-NEXT: bl .test_byval_31Byte +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 112 +; 64BIT-NEXT: ld 0, 16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %call = call double @test_byval_31Byte(ptr byval(%struct.S31) align 1 @gS31) ret void } - -; CHECK-LABEL: name: call_test_byval_31Byte{{.*}} - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-NEXT: renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS31, $r2 :: (load (s32) from got) -; 32BIT-DAG: renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r4 = LWZ 4, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r5 = LWZ 8, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r6 = LWZ 12, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r7 = LWZ 16, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r8 = LWZ 20, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r9 = LWZ 24, renamable $r[[REGADDR]] :: (load (s32)) -; 32BIT-DAG: renamable $r[[REG:[0-9]+]] = LHZ 28, renamable $r[[REGADDR]] :: (load (s16)) -; 32BIT-DAG: renamable $r10 = LBZ 30, renamable $r[[REGADDR]] :: (load (s8)) -; 32BIT-DAG: renamable $r10 = RLWINM killed renamable $r10, 8, 16, 23 -; 32BIT-DAG: renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r[[REG]], 16, 0, 15 -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1 -; 
32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; CHECKASM-LABEL: .call_test_byval_31Byte: - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM32: stwu 1, -64(1) -; ASM32-NEXT: lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM32-DAG: lwz 3, 0([[REGADDR]]) -; ASM32-DAG: lwz 4, 4([[REGADDR]]) -; ASM32-DAG: lwz 5, 8([[REGADDR]]) -; ASM32-DAG: lwz 6, 12([[REGADDR]]) -; ASM32-DAG: lwz 7, 16([[REGADDR]]) -; ASM32-DAG: lwz 8, 20([[REGADDR]]) -; ASM32-DAG: lwz 9, 24([[REGADDR]]) -; ASM32-DAG: lbz 10, 30([[REGADDR]]) -; ASM32-DAG: lhz [[REG:[0-9]+]], 28([[REGADDR]]) -; ASM32-DAG: rlwinm 10, 10, 8, 16, 23 -; ASM32-DAG: rlwimi 10, [[REG]], 16, 0, 15 -; ASM32-NEXT: bl .test_byval_31Byte -; ASM32-NEXT: nop - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS31, $x2 :: (load (s64) from got) -; 64BIT-DAG: renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load (s64)) -; 64BIT-DAG: renamable $x4 = LD 8, renamable $x[[REGADDR]] :: (load (s64)) -; 64BIT-DAG: renamable $x5 = LD 16, renamable $x[[REGADDR]] :: (load (s64)) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 24, renamable $x[[REGADDR]] :: (load (s32)) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 28, renamable $x[[REGADDR]] :: (load (s16)) -; 64BIT-DAG: renamable $x[[REG3:[0-9]+]] = LBZ8 30, renamable $x[[REGADDR]] :: (load (s8)) -; 64BIT-DAG: renamable $x6 = RLWINM8 killed renamable $x[[REG3]], 8, 16, 23 -; 64BIT-DAG: renamable $x6 = RLWIMI8 killed renamable $x6, killed renamable $x[[REG2]], 16, 0, 15 -; 64BIT-DAG: renamable $x6 = RLDIMI killed renamable $x6, killed renamable $x[[REG1]], 32, 0 -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: 
ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - -; ASM64: stdu 1, -112(1) -; ASM64-NEXT: ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2) -; ASM64-DAG: ld 3, 0([[REGADDR]]) -; ASM64-DAG: ld 4, 8([[REGADDR]]) -; ASM64-DAG: ld 5, 16([[REGADDR]]) -; ASM64-DAG: lwz [[REG1:[0-9]+]], 24([[REGADDR]]) -; ASM64-DAG: lhz [[REG2:[0-9]+]], 28([[REGADDR]]) -; ASM64-DAG: lbz [[REG3:[0-9]+]], 30([[REGADDR]]) -; ASM64-DAG: rlwinm 6, [[REG3]], 8, 16, 23 -; ASM64-DAG: rlwimi 6, [[REG2]], 16, 0, 15 -; ASM64-DAG: rldimi 6, [[REG1]], 32, 0 -; ASM64-NEXT: bl .test_byval_31Byte -; ASM64-NEXT: nop - - - define double @test_byval_31Byte(ptr byval(%struct.S31) align 1 %s) { +; 32BIT-LABEL: test_byval_31Byte: +; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: stw 8, 44(1) +; 32BIT-NEXT: stw 7, 40(1) +; 32BIT-NEXT: lfd 1, 40(1) +; 32BIT-NEXT: stw 3, 24(1) +; 32BIT-NEXT: stw 4, 28(1) +; 32BIT-NEXT: stw 5, 32(1) +; 32BIT-NEXT: stw 6, 36(1) +; 32BIT-NEXT: stw 9, 48(1) +; 32BIT-NEXT: stw 10, 52(1) +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: test_byval_31Byte: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: std 5, 64(1) +; 64BIT-NEXT: lfd 1, 64(1) +; 64BIT-NEXT: std 3, 48(1) +; 64BIT-NEXT: std 4, 56(1) +; 64BIT-NEXT: std 6, 72(1) +; 64BIT-NEXT: blr entry: %gep = getelementptr inbounds %struct.S31, ptr %s, i32 0, i32 3 %load = load double, ptr %gep, align 1 ret double %load } -; CHECK-LABEL: name: test_byval_31Byte - -; 32BIT: fixedStack: -; 32BIT-NEXT: - { id: 0, type: default, offset: 24, size: 32, alignment: 8, stack-id: default, -; 32BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, - -; 32BIT: bb.0.entry: -; 32BIT-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 -; 32BIT-DAG: STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 -; 32BIT-DAG: STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4 -; 32BIT-DAG: STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8 -; 
32BIT-DAG: STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12 -; 32BIT-DAG: STW killed renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16 -; 32BIT-DAG: STW killed renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20 -; 32BIT-NEXT: renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64) -; 32BIT-DAG: STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24 -; 32BIT-DAG: STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28 -; 32BIT-NEXT: BLR - -; 64BIT: fixedStack: -; 64BIT-NEXT: - { id: 0, type: default, offset: 48, size: 32, alignment: 16, stack-id: default, -; 64BIT-NEXT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true, - -; 64BIT: bb.0.entry: -; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6 -; 64BIT-DAG: STD killed renamable $x3, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 -; 64BIT-DAG: STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16 -; 64BIT-NEXT: renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64) -; 64BIT-DAG: STD killed renamable $x4, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8 -; 64BIT-DAG: STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24 -; 64BIT-NEXT: BLR8 - -; ASM32-LABEL: .test_byval_31Byte: - -; ASM32-DAG: stw 8, 44(1) -; ASM32: stw 7, 40(1) -; ASM32-DAG: lfd 1, 40(1) -; ASM32-DAG: stw 3, 24(1) -; ASM32-DAG: stw 4, 28(1) -; ASM32-DAG: stw 5, 32(1) -; ASM32-DAG: stw 6, 36(1) -; ASM32-DAG: stw 9, 48(1) -; ASM32-DAG: stw 10, 52(1) -; ASM32-NEXT: blr - -; ASM64: std 5, 64(1) -; ASM64: lfd 1, 64(1) -; ASM64-DAG: std 3, 48(1) -; ASM64-DAG: std 4, 56(1) -; ASM64-DAG: std 6, 72(1) -; ASM64-NEXT: blr - %struct.F = type { float, float, float } define i32 @call_test_byval_homogeneous_float_struct() { +; 32BIT-LABEL: call_test_byval_homogeneous_float_struct: 
+; 32BIT: # %bb.0: # %entry +; 32BIT-NEXT: mflr 0 +; 32BIT-NEXT: stwu 1, -80(1) +; 32BIT-NEXT: li 3, 0 +; 32BIT-NEXT: stw 0, 88(1) +; 32BIT-NEXT: stw 3, 72(1) +; 32BIT-NEXT: stw 3, 68(1) +; 32BIT-NEXT: lwz 5, 72(1) +; 32BIT-NEXT: lwz 4, 68(1) +; 32BIT-NEXT: stw 3, 64(1) +; 32BIT-NEXT: bl .test_byval_homogeneous_float_struct[PR] +; 32BIT-NEXT: nop +; 32BIT-NEXT: addi 1, 1, 80 +; 32BIT-NEXT: lwz 0, 8(1) +; 32BIT-NEXT: mtlr 0 +; 32BIT-NEXT: blr +; +; 64BIT-LABEL: call_test_byval_homogeneous_float_struct: +; 64BIT: # %bb.0: # %entry +; 64BIT-NEXT: mflr 0 +; 64BIT-NEXT: stdu 1, -128(1) +; 64BIT-NEXT: li 3, 0 +; 64BIT-NEXT: std 0, 144(1) +; 64BIT-NEXT: stw 3, 120(1) +; 64BIT-NEXT: std 3, 112(1) +; 64BIT-NEXT: lwz 3, 120(1) +; 64BIT-NEXT: sldi 4, 3, 32 +; 64BIT-NEXT: li 3, 0 +; 64BIT-NEXT: bl .test_byval_homogeneous_float_struct[PR] +; 64BIT-NEXT: nop +; 64BIT-NEXT: addi 1, 1, 128 +; 64BIT-NEXT: ld 0, 16(1) +; 64BIT-NEXT: mtlr 0 +; 64BIT-NEXT: blr entry: %s = alloca %struct.F, align 8 call void @llvm.memset.p0.i32(ptr align 4 %s, i8 0, i32 12, i1 false) @@ -954,37 +671,3 @@ entry: declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) declare i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4) - -; CHECK-LABEL: name: call_test_byval_homogeneous_float_struct{{.*}} - -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT-DAG: renamable $r4 = LWZ 4, %stack.0.s :: (load (s32) from %stack.0.s + 4) -; 32BIT-DAG: renamable $r5 = LWZ 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8) -; 32BIT-DAG: $r3 = LI 0 -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r2, implicit-def $r1, implicit-def $r3 -; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; CHECKASM-LABEL: .call_test_byval_homogeneous_float_struct: - -; ASM32: stwu 1, -80(1) -; ASM32-DAG: lwz 4, 68(1) -; ASM32-DAG: lwz 5, 72(1) -; ASM32-DAG: stw 3, 
64(1) -; ASM32-NEXT: bl .test_byval_homogeneous_float_struct[PR] -; ASM32-NEXT: nop - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: renamable $x3 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8) -; 64BIT-NEXT: renamable $x4 = RLDICR killed renamable $x3, 32, 31 -; 64BIT-NEXT: $x3 = LI8 0 -; 64BIT-NEXT: BL8_NOP , csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - -; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM64: stdu 1, -128(1) -; ASM64: lwz 3, 120(1) -; ASM64-NEXT: sldi 4, 3, 32 -; ASM64-NEXT: li 3, 0 -; ASM64-NEXT: bl .test_byval_homogeneous_float_struct[PR] -; ASM64-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll index 472be4fa63643..4697a093e5d6d 100644 --- a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll +++ b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll @@ -8,123 +8,124 @@ ; RUN: FileCheck --check-prefix=64BIT %s define <4 x i32> @caller() { + ; 32BIT-LABEL: name: caller ; 32BIT: bb.0.entry: - ; 32BIT: ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1 - ; 32BIT: [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got) - ; 32BIT: [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool) - ; 32BIT: [[LI:%[0-9]+]]:gprc = LI 48 - ; 32BIT: STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128)) - ; 32BIT: [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got) - ; 32BIT: [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool) - ; 32BIT: [[LI1:%[0-9]+]]:gprc = LI 32 - ; 32BIT: STXVW4X killed 
[[LXVW4X1]], $r1, killed [[LI1]] :: (store (s128)) - ; 32BIT: [[LWZtoc2:%[0-9]+]]:gprc = LWZtoc %const.2, $r2 :: (load (s32) from got) - ; 32BIT: [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc2]] :: (load (s128) from constant-pool) - ; 32BIT: [[LI2:%[0-9]+]]:gprc = LI 160 - ; 32BIT: STXVW4X killed [[LXVW4X2]], $r1, killed [[LI2]] :: (store (s128)) - ; 32BIT: [[LWZtoc3:%[0-9]+]]:gprc = LWZtoc %const.3, $r2 :: (load (s32) from got) - ; 32BIT: [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc3]] :: (load (s128) from constant-pool) - ; 32BIT: [[LI3:%[0-9]+]]:gprc = LI 144 - ; 32BIT: STXVW4X killed [[LXVW4X3]], $r1, killed [[LI3]] :: (store (s128)) - ; 32BIT: [[LWZtoc4:%[0-9]+]]:gprc = LWZtoc %const.4, $r2 :: (load (s32) from got) - ; 32BIT: [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc4]] :: (load (s128) from constant-pool) - ; 32BIT: [[LI4:%[0-9]+]]:gprc = LI 128 - ; 32BIT: STXVW4X killed [[LXVW4X4]], $r1, killed [[LI4]] :: (store (s128)) - ; 32BIT: [[LWZtoc5:%[0-9]+]]:gprc = LWZtoc %const.5, $r2 :: (load (s32) from got) - ; 32BIT: [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc5]] :: (load (s128) from constant-pool) - ; 32BIT: [[LI5:%[0-9]+]]:gprc = LI 112 - ; 32BIT: STXVW4X killed [[LXVW4X5]], $r1, killed [[LI5]] :: (store (s128)) - ; 32BIT: [[LWZtoc6:%[0-9]+]]:gprc = LWZtoc %const.6, $r2 :: (load (s32) from got) - ; 32BIT: [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc6]] :: (load (s128) from constant-pool) - ; 32BIT: [[LI6:%[0-9]+]]:gprc = LI 96 - ; 32BIT: STXVW4X killed [[LXVW4X6]], $r1, killed [[LI6]] :: (store (s128)) - ; 32BIT: [[LWZtoc7:%[0-9]+]]:gprc = LWZtoc %const.7, $r2 :: (load (s32) from got) - ; 32BIT: [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc7]] :: (load (s128) from constant-pool) - ; 32BIT: [[LI7:%[0-9]+]]:gprc = LI 80 - ; 32BIT: STXVW4X killed [[LXVW4X7]], $r1, killed [[LI7]] :: (store (s128)) - ; 32BIT: [[LWZtoc8:%[0-9]+]]:gprc = LWZtoc %const.8, $r2 :: (load (s32) from got) - ; 32BIT: 
[[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc8]] :: (load (s128) from constant-pool) - ; 32BIT: [[LI8:%[0-9]+]]:gprc = LI 64 - ; 32BIT: STXVW4X killed [[LXVW4X8]], $r1, killed [[LI8]] :: (store (s128)) - ; 32BIT: [[LWZ:%[0-9]+]]:gprc = LWZ 52, $r1 :: (load (s32)) - ; 32BIT: [[LWZ1:%[0-9]+]]:gprc = LWZ 48, $r1 :: (load (s32)) - ; 32BIT: [[LWZ2:%[0-9]+]]:gprc = LWZ 44, $r1 :: (load (s32)) - ; 32BIT: [[LWZ3:%[0-9]+]]:gprc = LWZ 40, $r1 :: (load (s32)) - ; 32BIT: [[LWZ4:%[0-9]+]]:gprc = LWZ 36, $r1 :: (load (s32)) - ; 32BIT: [[LWZ5:%[0-9]+]]:gprc = LWZ 32, $r1 :: (load (s32)) - ; 32BIT: [[LI9:%[0-9]+]]:gprc = LI 9 - ; 32BIT: $r3 = COPY [[LI9]] - ; 32BIT: $r5 = COPY [[LWZ5]] - ; 32BIT: $r6 = COPY [[LWZ4]] - ; 32BIT: $r7 = COPY [[LWZ3]] - ; 32BIT: $r8 = COPY [[LWZ2]] - ; 32BIT: $r9 = COPY [[LWZ1]] - ; 32BIT: $r10 = COPY [[LWZ]] - ; 32BIT: BL_NOP , csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def $v2 - ; 32BIT: ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1 - ; 32BIT: [[COPY:%[0-9]+]]:vsrc = COPY $v2 - ; 32BIT: $v2 = COPY [[COPY]] - ; 32BIT: BLR implicit $lr, implicit $rm, implicit $v2 - + ; 32BIT-NEXT: ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LI:%[0-9]+]]:gprc = LI 48 + ; 32BIT-NEXT: STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128)) + ; 32BIT-NEXT: [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LI1:%[0-9]+]]:gprc = LI 32 + ; 32BIT-NEXT: STXVW4X killed [[LXVW4X1]], $r1, killed [[LI1]] :: (store (s128)) 
+ ; 32BIT-NEXT: [[LWZtoc2:%[0-9]+]]:gprc = LWZtoc %const.2, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc2]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LI2:%[0-9]+]]:gprc = LI 160 + ; 32BIT-NEXT: STXVW4X killed [[LXVW4X2]], $r1, killed [[LI2]] :: (store (s128)) + ; 32BIT-NEXT: [[LWZtoc3:%[0-9]+]]:gprc = LWZtoc %const.3, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc3]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LI3:%[0-9]+]]:gprc = LI 144 + ; 32BIT-NEXT: STXVW4X killed [[LXVW4X3]], $r1, killed [[LI3]] :: (store (s128)) + ; 32BIT-NEXT: [[LWZtoc4:%[0-9]+]]:gprc = LWZtoc %const.4, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc4]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LI4:%[0-9]+]]:gprc = LI 128 + ; 32BIT-NEXT: STXVW4X killed [[LXVW4X4]], $r1, killed [[LI4]] :: (store (s128)) + ; 32BIT-NEXT: [[LWZtoc5:%[0-9]+]]:gprc = LWZtoc %const.5, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc5]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LI5:%[0-9]+]]:gprc = LI 112 + ; 32BIT-NEXT: STXVW4X killed [[LXVW4X5]], $r1, killed [[LI5]] :: (store (s128)) + ; 32BIT-NEXT: [[LWZtoc6:%[0-9]+]]:gprc = LWZtoc %const.6, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc6]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LI6:%[0-9]+]]:gprc = LI 96 + ; 32BIT-NEXT: STXVW4X killed [[LXVW4X6]], $r1, killed [[LI6]] :: (store (s128)) + ; 32BIT-NEXT: [[LWZtoc7:%[0-9]+]]:gprc = LWZtoc %const.7, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc7]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LI7:%[0-9]+]]:gprc = LI 80 + ; 32BIT-NEXT: STXVW4X killed [[LXVW4X7]], $r1, killed [[LI7]] :: (store (s128)) + ; 32BIT-NEXT: 
[[LWZtoc8:%[0-9]+]]:gprc = LWZtoc %const.8, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc8]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LI8:%[0-9]+]]:gprc = LI 64 + ; 32BIT-NEXT: STXVW4X killed [[LXVW4X8]], $r1, killed [[LI8]] :: (store (s128)) + ; 32BIT-NEXT: [[LWZ:%[0-9]+]]:gprc = LWZ 52, $r1 :: (load (s32)) + ; 32BIT-NEXT: [[LWZ1:%[0-9]+]]:gprc = LWZ 48, $r1 :: (load (s32)) + ; 32BIT-NEXT: [[LWZ2:%[0-9]+]]:gprc = LWZ 44, $r1 :: (load (s32)) + ; 32BIT-NEXT: [[LWZ3:%[0-9]+]]:gprc = LWZ 40, $r1 :: (load (s32)) + ; 32BIT-NEXT: [[LWZ4:%[0-9]+]]:gprc = LWZ 36, $r1 :: (load (s32)) + ; 32BIT-NEXT: [[LWZ5:%[0-9]+]]:gprc = LWZ 32, $r1 :: (load (s32)) + ; 32BIT-NEXT: [[LI9:%[0-9]+]]:gprc = LI 9 + ; 32BIT-NEXT: $r3 = COPY [[LI9]] + ; 32BIT-NEXT: $r5 = COPY [[LWZ5]] + ; 32BIT-NEXT: $r6 = COPY [[LWZ4]] + ; 32BIT-NEXT: $r7 = COPY [[LWZ3]] + ; 32BIT-NEXT: $r8 = COPY [[LWZ2]] + ; 32BIT-NEXT: $r9 = COPY [[LWZ1]] + ; 32BIT-NEXT: $r10 = COPY [[LWZ]] + ; 32BIT-NEXT: BL_NOP , csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def $v2 + ; 32BIT-NEXT: ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: [[COPY:%[0-9]+]]:vsrc = COPY $v2 + ; 32BIT-NEXT: $v2 = COPY [[COPY]] + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $v2 + ; ; 64BIT-LABEL: name: caller ; 64BIT: bb.0.entry: - ; 64BIT: ADJCALLSTACKDOWN 208, 0, implicit-def dead $r1, implicit $r1 - ; 64BIT: [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got) - ; 64BIT: [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool) - ; 64BIT: [[LI8_:%[0-9]+]]:g8rc = LI8 96 - ; 64BIT: STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128)) - ; 64BIT: [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got) - ; 
64BIT: [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool) - ; 64BIT: [[LI8_1:%[0-9]+]]:g8rc = LI8 80 - ; 64BIT: STXVW4X killed [[LXVW4X1]], $x1, killed [[LI8_1]] :: (store (s128)) - ; 64BIT: [[LDtocCPT2:%[0-9]+]]:g8rc = LDtocCPT %const.2, $x2 :: (load (s64) from got) - ; 64BIT: [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT2]] :: (load (s128) from constant-pool) - ; 64BIT: [[LI8_2:%[0-9]+]]:g8rc = LI8 64 - ; 64BIT: STXVW4X killed [[LXVW4X2]], $x1, killed [[LI8_2]] :: (store (s128)) - ; 64BIT: [[LDtocCPT3:%[0-9]+]]:g8rc = LDtocCPT %const.3, $x2 :: (load (s64) from got) - ; 64BIT: [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT3]] :: (load (s128) from constant-pool) - ; 64BIT: [[LI8_3:%[0-9]+]]:g8rc = LI8 192 - ; 64BIT: STXVW4X killed [[LXVW4X3]], $x1, killed [[LI8_3]] :: (store (s128)) - ; 64BIT: [[LDtocCPT4:%[0-9]+]]:g8rc = LDtocCPT %const.4, $x2 :: (load (s64) from got) - ; 64BIT: [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT4]] :: (load (s128) from constant-pool) - ; 64BIT: [[LI8_4:%[0-9]+]]:g8rc = LI8 176 - ; 64BIT: STXVW4X killed [[LXVW4X4]], $x1, killed [[LI8_4]] :: (store (s128)) - ; 64BIT: [[LDtocCPT5:%[0-9]+]]:g8rc = LDtocCPT %const.5, $x2 :: (load (s64) from got) - ; 64BIT: [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT5]] :: (load (s128) from constant-pool) - ; 64BIT: [[LI8_5:%[0-9]+]]:g8rc = LI8 160 - ; 64BIT: STXVW4X killed [[LXVW4X5]], $x1, killed [[LI8_5]] :: (store (s128)) - ; 64BIT: [[LDtocCPT6:%[0-9]+]]:g8rc = LDtocCPT %const.6, $x2 :: (load (s64) from got) - ; 64BIT: [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT6]] :: (load (s128) from constant-pool) - ; 64BIT: [[LI8_6:%[0-9]+]]:g8rc = LI8 144 - ; 64BIT: STXVW4X killed [[LXVW4X6]], $x1, killed [[LI8_6]] :: (store (s128)) - ; 64BIT: [[LDtocCPT7:%[0-9]+]]:g8rc = LDtocCPT %const.7, $x2 :: (load (s64) from got) - ; 64BIT: [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT7]] :: 
(load (s128) from constant-pool) - ; 64BIT: [[LI8_7:%[0-9]+]]:g8rc = LI8 128 - ; 64BIT: STXVW4X killed [[LXVW4X7]], $x1, killed [[LI8_7]] :: (store (s128)) - ; 64BIT: [[LDtocCPT8:%[0-9]+]]:g8rc = LDtocCPT %const.8, $x2 :: (load (s64) from got) - ; 64BIT: [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT8]] :: (load (s128) from constant-pool) - ; 64BIT: [[LI8_8:%[0-9]+]]:g8rc = LI8 112 - ; 64BIT: STXVW4X killed [[LXVW4X8]], $x1, killed [[LI8_8]] :: (store (s128)) - ; 64BIT: [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64)) - ; 64BIT: [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64)) - ; 64BIT: [[LD2:%[0-9]+]]:g8rc = LD 88, $x1 :: (load (s64)) - ; 64BIT: [[LD3:%[0-9]+]]:g8rc = LD 80, $x1 :: (load (s64)) - ; 64BIT: [[LD4:%[0-9]+]]:g8rc = LD 72, $x1 :: (load (s64)) - ; 64BIT: [[LD5:%[0-9]+]]:g8rc = LD 64, $x1 :: (load (s64)) - ; 64BIT: [[LI8_9:%[0-9]+]]:g8rc = LI8 9 - ; 64BIT: $x3 = COPY [[LI8_9]] - ; 64BIT: $x5 = COPY [[LD5]] - ; 64BIT: $x6 = COPY [[LD4]] - ; 64BIT: $x7 = COPY [[LD3]] - ; 64BIT: $x8 = COPY [[LD2]] - ; 64BIT: $x9 = COPY [[LD1]] - ; 64BIT: $x10 = COPY [[LD]] - ; 64BIT: BL8_NOP , csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def $v2 - ; 64BIT: ADJCALLSTACKUP 208, 0, implicit-def dead $r1, implicit $r1 - ; 64BIT: [[COPY:%[0-9]+]]:vsrc = COPY $v2 - ; 64BIT: $v2 = COPY [[COPY]] - ; 64BIT: BLR8 implicit $lr8, implicit $rm, implicit $v2 + ; 64BIT-NEXT: ADJCALLSTACKDOWN 208, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LI8_:%[0-9]+]]:g8rc = LI8 96 + ; 64BIT-NEXT: STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128)) + ; 64BIT-NEXT: [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT 
%const.1, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LI8_1:%[0-9]+]]:g8rc = LI8 80 + ; 64BIT-NEXT: STXVW4X killed [[LXVW4X1]], $x1, killed [[LI8_1]] :: (store (s128)) + ; 64BIT-NEXT: [[LDtocCPT2:%[0-9]+]]:g8rc = LDtocCPT %const.2, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT2]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LI8_2:%[0-9]+]]:g8rc = LI8 64 + ; 64BIT-NEXT: STXVW4X killed [[LXVW4X2]], $x1, killed [[LI8_2]] :: (store (s128)) + ; 64BIT-NEXT: [[LDtocCPT3:%[0-9]+]]:g8rc = LDtocCPT %const.3, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT3]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LI8_3:%[0-9]+]]:g8rc = LI8 192 + ; 64BIT-NEXT: STXVW4X killed [[LXVW4X3]], $x1, killed [[LI8_3]] :: (store (s128)) + ; 64BIT-NEXT: [[LDtocCPT4:%[0-9]+]]:g8rc = LDtocCPT %const.4, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT4]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LI8_4:%[0-9]+]]:g8rc = LI8 176 + ; 64BIT-NEXT: STXVW4X killed [[LXVW4X4]], $x1, killed [[LI8_4]] :: (store (s128)) + ; 64BIT-NEXT: [[LDtocCPT5:%[0-9]+]]:g8rc = LDtocCPT %const.5, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT5]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LI8_5:%[0-9]+]]:g8rc = LI8 160 + ; 64BIT-NEXT: STXVW4X killed [[LXVW4X5]], $x1, killed [[LI8_5]] :: (store (s128)) + ; 64BIT-NEXT: [[LDtocCPT6:%[0-9]+]]:g8rc = LDtocCPT %const.6, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT6]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LI8_6:%[0-9]+]]:g8rc = LI8 144 + ; 64BIT-NEXT: STXVW4X killed [[LXVW4X6]], $x1, killed [[LI8_6]] :: (store (s128)) + ; 
64BIT-NEXT: [[LDtocCPT7:%[0-9]+]]:g8rc = LDtocCPT %const.7, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT7]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LI8_7:%[0-9]+]]:g8rc = LI8 128 + ; 64BIT-NEXT: STXVW4X killed [[LXVW4X7]], $x1, killed [[LI8_7]] :: (store (s128)) + ; 64BIT-NEXT: [[LDtocCPT8:%[0-9]+]]:g8rc = LDtocCPT %const.8, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT8]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LI8_8:%[0-9]+]]:g8rc = LI8 112 + ; 64BIT-NEXT: STXVW4X killed [[LXVW4X8]], $x1, killed [[LI8_8]] :: (store (s128)) + ; 64BIT-NEXT: [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64)) + ; 64BIT-NEXT: [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64)) + ; 64BIT-NEXT: [[LD2:%[0-9]+]]:g8rc = LD 88, $x1 :: (load (s64)) + ; 64BIT-NEXT: [[LD3:%[0-9]+]]:g8rc = LD 80, $x1 :: (load (s64)) + ; 64BIT-NEXT: [[LD4:%[0-9]+]]:g8rc = LD 72, $x1 :: (load (s64)) + ; 64BIT-NEXT: [[LD5:%[0-9]+]]:g8rc = LD 64, $x1 :: (load (s64)) + ; 64BIT-NEXT: [[LI8_9:%[0-9]+]]:g8rc = LI8 9 + ; 64BIT-NEXT: $x3 = COPY [[LI8_9]] + ; 64BIT-NEXT: $x5 = COPY [[LD5]] + ; 64BIT-NEXT: $x6 = COPY [[LD4]] + ; 64BIT-NEXT: $x7 = COPY [[LD3]] + ; 64BIT-NEXT: $x8 = COPY [[LD2]] + ; 64BIT-NEXT: $x9 = COPY [[LD1]] + ; 64BIT-NEXT: $x10 = COPY [[LD]] + ; 64BIT-NEXT: BL8_NOP , csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def $v2 + ; 64BIT-NEXT: ADJCALLSTACKUP 208, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: [[COPY:%[0-9]+]]:vsrc = COPY $v2 + ; 64BIT-NEXT: $v2 = COPY [[COPY]] + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $v2 entry: %call = tail call <4 x i32> (i32, ...) 
@callee(i32 9, <4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> , <4 x i32> ) ret <4 x i32> %call diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll index f3e58b7897948..fad275f58cd01 100644 --- a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll +++ b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll @@ -12,76 +12,77 @@ define void @caller() { ; 32BIT-LABEL: name: caller ; 32BIT: bb.0.entry: - ; 32BIT: ADJCALLSTACKDOWN 88, 0, implicit-def dead $r1, implicit $r1 - ; 32BIT: [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got) - ; 32BIT: [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool) - ; 32BIT: [[LI:%[0-9]+]]:gprc = LI 64 - ; 32BIT: STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128)) - ; 32BIT: [[LIS:%[0-9]+]]:gprc = LIS 38314 - ; 32BIT: [[ORI:%[0-9]+]]:gprc = ORI killed [[LIS]], 63376 - ; 32BIT: STW killed [[ORI]], 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8) - ; 32BIT: [[LIS1:%[0-9]+]]:gprc = LIS 16389 - ; 32BIT: [[ORI1:%[0-9]+]]:gprc = ORI killed [[LIS1]], 48905 - ; 32BIT: STW killed [[ORI1]], 80, $r1 :: (store (s32), align 8) - ; 32BIT: [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got) - ; 32BIT: [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool) - ; 32BIT: [[LWZtoc2:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.2, $r2 :: (load (s32) from got) - ; 32BIT: [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc2]] :: (load (s64) from constant-pool) - ; 32BIT: [[LIS2:%[0-9]+]]:gprc = LIS 16393 - ; 32BIT: [[ORI2:%[0-9]+]]:gprc = ORI killed [[LIS2]], 8697 - ; 32BIT: [[LIS3:%[0-9]+]]:gprc = LIS 61467 - ; 32BIT: [[ORI3:%[0-9]+]]:gprc = ORI killed [[LIS3]], 34414 - ; 32BIT: [[LWZtoc3:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.3, $r2 :: (load (s32) from got) - ; 32BIT: 
[[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc3]] :: (load (s64) from constant-pool) - ; 32BIT: [[LI1:%[0-9]+]]:gprc = LI 55 - ; 32BIT: $r3 = COPY [[LI1]] - ; 32BIT: $v2 = COPY [[LXVW4X1]] - ; 32BIT: $f1 = COPY [[LFD]] - ; 32BIT: $r9 = COPY [[ORI2]] - ; 32BIT: $r10 = COPY [[ORI3]] - ; 32BIT: $f2 = COPY [[LFD1]] - ; 32BIT: BL_NOP , csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $v2, implicit $f1, implicit $r9, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1, implicit-def $v2 - ; 32BIT: ADJCALLSTACKUP 88, 0, implicit-def dead $r1, implicit $r1 - ; 32BIT: [[COPY:%[0-9]+]]:vsrc = COPY $v2 - ; 32BIT: BLR implicit $lr, implicit $rm + ; 32BIT-NEXT: ADJCALLSTACKDOWN 88, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LI:%[0-9]+]]:gprc = LI 64 + ; 32BIT-NEXT: STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128)) + ; 32BIT-NEXT: [[LIS:%[0-9]+]]:gprc = LIS 38314 + ; 32BIT-NEXT: [[ORI:%[0-9]+]]:gprc = ORI killed [[LIS]], 63376 + ; 32BIT-NEXT: STW killed [[ORI]], 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8) + ; 32BIT-NEXT: [[LIS1:%[0-9]+]]:gprc = LIS 16389 + ; 32BIT-NEXT: [[ORI1:%[0-9]+]]:gprc = ORI killed [[LIS1]], 48905 + ; 32BIT-NEXT: STW killed [[ORI1]], 80, $r1 :: (store (s32), align 8) + ; 32BIT-NEXT: [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool) + ; 32BIT-NEXT: [[LWZtoc2:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.2, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc2]] :: (load (s64) from constant-pool) + ; 32BIT-NEXT: [[LIS2:%[0-9]+]]:gprc = LIS 16393 + ; 32BIT-NEXT: [[ORI2:%[0-9]+]]:gprc = ORI killed [[LIS2]], 8697 + 
; 32BIT-NEXT: [[LIS3:%[0-9]+]]:gprc = LIS 61467 + ; 32BIT-NEXT: [[ORI3:%[0-9]+]]:gprc = ORI killed [[LIS3]], 34414 + ; 32BIT-NEXT: [[LWZtoc3:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.3, $r2 :: (load (s32) from got) + ; 32BIT-NEXT: [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc3]] :: (load (s64) from constant-pool) + ; 32BIT-NEXT: [[LI1:%[0-9]+]]:gprc = LI 55 + ; 32BIT-NEXT: $r3 = COPY [[LI1]] + ; 32BIT-NEXT: $v2 = COPY [[LXVW4X1]] + ; 32BIT-NEXT: $f1 = COPY [[LFD]] + ; 32BIT-NEXT: $r9 = COPY [[ORI2]] + ; 32BIT-NEXT: $r10 = COPY [[ORI3]] + ; 32BIT-NEXT: $f2 = COPY [[LFD1]] + ; 32BIT-NEXT: BL_NOP , csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $v2, implicit $f1, implicit $r9, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1, implicit-def $v2 + ; 32BIT-NEXT: ADJCALLSTACKUP 88, 0, implicit-def dead $r1, implicit $r1 + ; 32BIT-NEXT: [[COPY:%[0-9]+]]:vsrc = COPY $v2 + ; 32BIT-NEXT: BLR implicit $lr, implicit $rm + ; ; 64BIT-LABEL: name: caller ; 64BIT: bb.0.entry: - ; 64BIT: ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1 - ; 64BIT: [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got) - ; 64BIT: [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool) - ; 64BIT: [[LI8_:%[0-9]+]]:g8rc = LI8 96 - ; 64BIT: STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128)) - ; 64BIT: [[LIS8_:%[0-9]+]]:g8rc = LIS8 16389 - ; 64BIT: [[ORI8_:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_]], 48905 - ; 64BIT: [[RLDIC:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_]], 32, 1 - ; 64BIT: [[ORIS8_:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC]], 38314 - ; 64BIT: [[ORI8_1:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_]], 63376 - ; 64BIT: STD killed [[ORI8_1]], 112, $x1 :: (store (s64)) - ; 64BIT: [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got) - ; 64BIT: [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool) - ; 64BIT: 
[[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64)) - ; 64BIT: [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64)) - ; 64BIT: [[LDtocCPT2:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.2, $x2 :: (load (s64) from got) - ; 64BIT: [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT2]] :: (load (s64) from constant-pool) - ; 64BIT: [[LDtocCPT3:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.3, $x2 :: (load (s64) from got) - ; 64BIT: [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT3]] :: (load (s64) from constant-pool) - ; 64BIT: [[LIS8_1:%[0-9]+]]:g8rc = LIS8 16393 - ; 64BIT: [[ORI8_2:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_1]], 8697 - ; 64BIT: [[RLDIC1:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_2]], 32, 1 - ; 64BIT: [[ORIS8_1:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC1]], 61467 - ; 64BIT: [[ORI8_3:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_1]], 34414 - ; 64BIT: [[LI8_1:%[0-9]+]]:g8rc = LI8 55 - ; 64BIT: $x3 = COPY [[LI8_1]] - ; 64BIT: $v2 = COPY [[LXVW4X1]] - ; 64BIT: $f1 = COPY [[LFD]] - ; 64BIT: $x7 = COPY [[ORI8_3]] - ; 64BIT: $x9 = COPY [[LD1]] - ; 64BIT: $x10 = COPY [[LD]] - ; 64BIT: $f2 = COPY [[LFD1]] - ; 64BIT: BL8_NOP , csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $v2, implicit $f1, implicit $x7, implicit $x9, implicit $x10, implicit $f2, implicit $x2, implicit-def $r1, implicit-def $v2 - ; 64BIT: ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1 - ; 64BIT: [[COPY:%[0-9]+]]:vsrc = COPY $v2 - ; 64BIT: BLR8 implicit $lr8, implicit $rm + ; 64BIT-NEXT: ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1 + ; 64BIT-NEXT: [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LI8_:%[0-9]+]]:g8rc = LI8 96 + ; 64BIT-NEXT: STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128)) + ; 64BIT-NEXT: [[LIS8_:%[0-9]+]]:g8rc = LIS8 16389 + ; 64BIT-NEXT: [[ORI8_:%[0-9]+]]:g8rc = ORI8 killed 
[[LIS8_]], 48905 + ; 64BIT-NEXT: [[RLDIC:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_]], 32, 1 + ; 64BIT-NEXT: [[ORIS8_:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC]], 38314 + ; 64BIT-NEXT: [[ORI8_1:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_]], 63376 + ; 64BIT-NEXT: STD killed [[ORI8_1]], 112, $x1 :: (store (s64)) + ; 64BIT-NEXT: [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool) + ; 64BIT-NEXT: [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64)) + ; 64BIT-NEXT: [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64)) + ; 64BIT-NEXT: [[LDtocCPT2:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.2, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT2]] :: (load (s64) from constant-pool) + ; 64BIT-NEXT: [[LDtocCPT3:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.3, $x2 :: (load (s64) from got) + ; 64BIT-NEXT: [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT3]] :: (load (s64) from constant-pool) + ; 64BIT-NEXT: [[LIS8_1:%[0-9]+]]:g8rc = LIS8 16393 + ; 64BIT-NEXT: [[ORI8_2:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_1]], 8697 + ; 64BIT-NEXT: [[RLDIC1:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_2]], 32, 1 + ; 64BIT-NEXT: [[ORIS8_1:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC1]], 61467 + ; 64BIT-NEXT: [[ORI8_3:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_1]], 34414 + ; 64BIT-NEXT: [[LI8_1:%[0-9]+]]:g8rc = LI8 55 + ; 64BIT-NEXT: $x3 = COPY [[LI8_1]] + ; 64BIT-NEXT: $v2 = COPY [[LXVW4X1]] + ; 64BIT-NEXT: $f1 = COPY [[LFD]] + ; 64BIT-NEXT: $x7 = COPY [[ORI8_3]] + ; 64BIT-NEXT: $x9 = COPY [[LD1]] + ; 64BIT-NEXT: $x10 = COPY [[LD]] + ; 64BIT-NEXT: $f2 = COPY [[LFD1]] + ; 64BIT-NEXT: BL8_NOP , csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $v2, implicit $f1, implicit $x7, implicit $x9, implicit $x10, implicit $f2, implicit $x2, implicit-def $r1, implicit-def $v2 + ; 64BIT-NEXT: ADJCALLSTACKUP 120, 0, implicit-def dead $r1, 
implicit $r1 + ; 64BIT-NEXT: [[COPY:%[0-9]+]]:vsrc = COPY $v2 + ; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm entry: %call = tail call <4 x i32> (i32, <4 x i32>, double, ...) @callee(i32 signext 55, <4 x i32> , double 3.141590e+00, <4 x i32> , double 2.718280e+00) ret void diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll index decc4a38f7ccd..f9c953d483ff2 100644 --- a/llvm/test/CodeGen/PowerPC/f128-arith.ll +++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll @@ -1403,3 +1403,448 @@ entry: ret fp128 %3 } declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) + + +define dso_local fp128 @acos_f128(fp128 %x) { +; CHECK-LABEL: acos_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl acosf128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: acos_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: stdu r1, -32(r1) +; CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: bl acosf128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.acos.f128(fp128 %x) + ret fp128 %result +} + +define dso_local fp128 @asin_f128(fp128 %x) { +; CHECK-LABEL: asin_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl asinf128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: asin_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: stdu r1, -32(r1) +; 
CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: bl asinf128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.asin.f128(fp128 %x) + ret fp128 %result +} + +define dso_local fp128 @atan_f128(fp128 %x) { +; CHECK-LABEL: atan_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl atanf128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: atan_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: stdu r1, -32(r1) +; CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: bl atanf128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.atan.f128(fp128 %x) + ret fp128 %result +} + +define dso_local fp128 @atan2_f128(fp128 %x, fp128 %y) { +; CHECK-LABEL: atan2_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl atan2f128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: atan2_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: stdu r1, -32(r1) +; CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: bl atan2f128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: mtlr r0 +; 
CHECK-P8-NEXT: blr + %result = call fp128 @llvm.atan2.f128(fp128 %x, fp128 %y) + ret fp128 %result +} + +define dso_local fp128 @copysign_f128(fp128 %x, fp128 %y) { +; CHECK-LABEL: copysign_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: xscpsgnqp v2, v3, v2 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: copysign_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: xxswapd vs0, v3 +; CHECK-P8-NEXT: addi r3, r1, -16 +; CHECK-P8-NEXT: xxswapd vs1, v2 +; CHECK-P8-NEXT: stxvd2x vs0, 0, r3 +; CHECK-P8-NEXT: addi r3, r1, -32 +; CHECK-P8-NEXT: stxvd2x vs1, 0, r3 +; CHECK-P8-NEXT: lbz r4, -1(r1) +; CHECK-P8-NEXT: lbz r5, -17(r1) +; CHECK-P8-NEXT: rlwimi r5, r4, 0, 0, 24 +; CHECK-P8-NEXT: stb r5, -17(r1) +; CHECK-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-P8-NEXT: xxswapd v2, vs0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.copysign.f128(fp128 %x, fp128 %y) + ret fp128 %result +} + +define dso_local fp128 @cosh_f128(fp128 %x) { +; CHECK-LABEL: cosh_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl coshf128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: cosh_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: stdu r1, -32(r1) +; CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: bl coshf128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.cosh.f128(fp128 %x) + ret fp128 %result +} + +define dso_local fp128 @exp10_f128(fp128 %x) { +; CHECK-LABEL: exp10_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl 
exp10f128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: exp10_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: stdu r1, -32(r1) +; CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: bl exp10f128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.exp10.f128(fp128 %x) + ret fp128 %result +} + +; FIXME: Asserts +; define dso_local fp128 @maximum_f128(fp128 %x, fp128 %y) { +; %result = call fp128 @llvm.maximum.f128(fp128 %x, fp128 %y) +; ret fp128 %result +; } + +; FIXME: Asserts +; define dso_local fp128 @minimum_f128(fp128 %x, fp128 %y) { +; %result = call fp128 @llvm.minimum.f128(fp128 %x, fp128 %y) +; ret fp128 %result +; } + +; FIXME: Asserts +; define dso_local fp128 @maximumnum_f128(fp128 %x, fp128 %y) { +; %result = call fp128 @llvm.maximumnum.f128(fp128 %x, fp128 %y) +; ret fp128 %result +; } + +; FIXME: Asserts +; define dso_local fp128 @minimumnum_f128(fp128 %x, fp128 %y) { +; %result = call fp128 @llvm.minimumnum.f128(fp128 %x, fp128 %y) +; ret fp128 %result +; } + +define dso_local fp128 @ldexp_f128(fp128 %x, i32 %y) { +; CHECK-LABEL: ldexp_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: extsw r5, r5 +; CHECK-NEXT: bl ldexpf128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: ldexp_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: stdu r1, -32(r1) +; CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: extsw r5, r5 +; 
CHECK-P8-NEXT: bl ldexpf128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.ldexp.f128.i32(fp128 %x, i32 %y) + ret fp128 %result +} + +define dso_local { fp128, fp128 } @modf_f128(fp128 %x) { +; CHECK-LABEL: modf_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -48(r1) +; CHECK-NEXT: std r0, 64(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: addi r5, r1, 32 +; CHECK-NEXT: bl modff128 +; CHECK-NEXT: nop +; CHECK-NEXT: lxv v3, 32(r1) +; CHECK-NEXT: addi r1, r1, 48 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: modf_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: .cfi_def_cfa_offset 64 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: .cfi_offset r30, -16 +; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-P8-NEXT: stdu r1, -64(r1) +; CHECK-P8-NEXT: addi r30, r1, 32 +; CHECK-P8-NEXT: std r0, 80(r1) +; CHECK-P8-NEXT: mr r5, r30 +; CHECK-P8-NEXT: bl modff128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: lxvd2x vs0, 0, r30 +; CHECK-P8-NEXT: xxswapd v3, vs0 +; CHECK-P8-NEXT: addi r1, r1, 64 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call { fp128, fp128 } @llvm.modf.f128(fp128 %x) + ret { fp128, fp128 } %result +} + +define dso_local fp128 @roundeven_f128(fp128 %x) { +; CHECK-LABEL: roundeven_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl roundevenf128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: roundeven_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; 
CHECK-P8-NEXT: stdu r1, -32(r1) +; CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: bl roundevenf128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.roundeven.f128(fp128 %x) + ret fp128 %result +} + +define dso_local fp128 @sinh_f128(fp128 %x) { +; CHECK-LABEL: sinh_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl sinhf128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: sinh_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: stdu r1, -32(r1) +; CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: bl sinhf128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.sinh.f128(fp128 %x) + ret fp128 %result +} + +define dso_local fp128 @tanh_f128(fp128 %x) { +; CHECK-LABEL: tanh_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl tanhf128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: tanh_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: stdu r1, -32(r1) +; CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: bl tanhf128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 
16(r1) +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.tanh.f128(fp128 %x) + ret fp128 %result +} + +define dso_local fp128 @tan_f128(fp128 %x) { +; CHECK-LABEL: tan_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl tanf128 +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; CHECK-P8-LABEL: tan_f128: +; CHECK-P8: # %bb.0: +; CHECK-P8-NEXT: mflr r0 +; CHECK-P8-NEXT: stdu r1, -32(r1) +; CHECK-P8-NEXT: std r0, 48(r1) +; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P8-NEXT: .cfi_offset lr, 16 +; CHECK-P8-NEXT: bl tanf128 +; CHECK-P8-NEXT: nop +; CHECK-P8-NEXT: addi r1, r1, 32 +; CHECK-P8-NEXT: ld r0, 16(r1) +; CHECK-P8-NEXT: mtlr r0 +; CHECK-P8-NEXT: blr + %result = call fp128 @llvm.tan.f128(fp128 %x) + ret fp128 %result +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll index 2ad068eb7dc3d..49276c9416234 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll @@ -9,7 +9,7 @@ declare @llvm.riscv.vadd.nxv1i8.nxv1i8( , i64) -; FALLBACK_WITH_REPORT_ERR: :0:0: unable to translate instruction: call: +; FALLBACK_WITH_REPORT_ERR: :0:0: unable to translate instruction: call ; FALLBACK-WITH-REPORT-OUT-LABEL: scalable_arg define @scalable_arg( %0, %1, i64 %2) nounwind { entry: @@ -22,7 +22,7 @@ entry: ret %a } -; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to translate instruction: call: +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to translate instruction: call ; FALLBACK-WITH-REPORT-OUT-LABEL: scalable_inst define @scalable_inst(i64 %0) nounwind { entry: diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll index 
694662eab1681..8714b286374a5 100644 --- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll @@ -62,6 +62,7 @@ ; CHECK-NEXT: Insert fentry calls ; CHECK-NEXT: Insert XRay ops ; CHECK-NEXT: Implement the 'patchable-function' attribute +; CHECK-NEXT: RISC-V Indirect Branch Tracking ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: RISC-V Make Compressible ; CHECK-NEXT: Contiguously Lay Out Funclets @@ -73,7 +74,6 @@ ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Stack Frame Layout Analysis -; CHECK-NEXT: RISC-V Indirect Branch Tracking ; CHECK-NEXT: RISC-V pseudo instruction expansion pass ; CHECK-NEXT: RISC-V atomic pseudo instruction expansion pass ; CHECK-NEXT: Unpack machine instruction bundles diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 19de864422bc5..c7f70a9d266c2 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -195,6 +195,7 @@ ; CHECK-NEXT: Implement the 'patchable-function' attribute ; CHECK-NEXT: Machine Copy Propagation Pass ; CHECK-NEXT: RISC-V Late Branch Optimisation Pass +; CHECK-NEXT: RISC-V Indirect Branch Tracking ; CHECK-NEXT: Branch relaxation pass ; CHECK-NEXT: RISC-V Make Compressible ; CHECK-NEXT: Contiguously Lay Out Funclets @@ -210,7 +211,6 @@ ; CHECK-NEXT: Stack Frame Layout Analysis ; CHECK-NEXT: RISC-V Zcmp move merging pass ; CHECK-NEXT: RISC-V Zcmp Push/Pop optimization pass -; CHECK-NEXT: RISC-V Indirect Branch Tracking ; CHECK-NEXT: RISC-V pseudo instruction expansion pass ; CHECK-NEXT: RISC-V atomic pseudo instruction expansion pass ; CHECK-NEXT: Unpack machine instruction bundles diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index ba8969b5a5382..c5188aa1918bf 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -106,6 +106,7 @@ ; RUN: llc 
-mtriple=riscv32 -mattr=+experimental-xqcisls %s -o - | FileCheck --check-prefix=RV32XQCISLS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisync %s -o - | FileCheck --check-prefix=RV32XQCISYNC %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV32XANDESPERF %s +; RUN: llc -mtriple=riscv32 -mattr=+xandesvbfhcvt %s -o - | FileCheck --check-prefix=RV32XANDESVBFHCVT %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV32XANDESVDOT %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesvpackfph %s -o - | FileCheck --check-prefix=RV32XANDESVPACKFPH %s ; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s @@ -260,6 +261,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+xtheadsync %s -o - | FileCheck --check-prefix=RV64XTHEADSYNC %s ; RUN: llc -mtriple=riscv64 -mattr=+xtheadvdot %s -o - | FileCheck --check-prefixes=CHECK,RV64XTHEADVDOT %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV64XANDESPERF %s +; RUN: llc -mtriple=riscv64 -mattr=+xandesvbfhcvt %s -o - | FileCheck --check-prefix=RV64XANDESVBFHCVT %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV64XANDESVDOT %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesvpackfph %s -o - | FileCheck --check-prefix=RV64XANDESVPACKFPH %s ; RUN: llc -mtriple=riscv64 -mattr=+za64rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA64RS %s @@ -437,7 +439,7 @@ ; RV32XTHEADMEMPAIR: .attribute 5, "rv32i2p1_xtheadmempair1p0" ; RV32XTHEADSYNC: .attribute 5, "rv32i2p1_xtheadsync1p0" ; RV32XWCHC: .attribute 5, "rv32i2p1_zca1p0_xwchc2p2" -; RV32XQCCMP: .attribute 5, "rv32i2p1_zca1p0_xqccmp0p1" +; RV32XQCCMP: .attribute 5, "rv32i2p1_zca1p0_xqccmp0p3" ; RV32XQCIA: .attribute 5, "rv32i2p1_xqcia0p7" ; RV32XQCIAC: .attribute 5, "rv32i2p1_zca1p0_xqciac0p3" ; RV32XQCIBI: .attribute 5, "rv32i2p1_zca1p0_xqcibi0p2" @@ -445,18 +447,19 @@ ; RV32XQCICLI: .attribute 5, "rv32i2p1_xqcicli0p3" ; 
RV32XQCICM: .attribute 5, "rv32i2p1_zca1p0_xqcicm0p2" ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2" -; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p3" -; RV32XQCIINT: .attribute 5, "rv32i2p1_zca1p0_xqciint0p7" +; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p4" +; RV32XQCIINT: .attribute 5, "rv32i2p1_zca1p0_xqciint0p10" ; RV32XQCIIO: .attribute 5, "rv32i2p1_xqciio0p1" ; RV32XQCILB: .attribute 5, "rv32i2p1_zca1p0_xqcilb0p2" ; RV32XQCILI: .attribute 5, "rv32i2p1_zca1p0_xqcili0p2" ; RV32XQCILIA: .attribute 5, "rv32i2p1_zca1p0_xqcilia0p2" ; RV32XQCILO: .attribute 5, "rv32i2p1_zca1p0_xqcilo0p3" -; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p5" +; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p6" ; RV32XQCISIM: attribute 5, "rv32i2p1_zca1p0_xqcisim0p2" ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2" ; RV32XQCISYNC: attribute 5, "rv32i2p1_zca1p0_xqcisync0p3" ; RV32XANDESPERF: .attribute 5, "rv32i2p1_xandesperf5p0" +; RV32XANDESVBFHCVT: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvl32b1p0_xandesvbfhcvt5p0" ; RV32XANDESVDOT: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0" ; RV32XANDESVPACKFPH: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfhmin1p0_zvl32b1p0_xandesvpackfph5p0" ; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0" @@ -612,6 +615,7 @@ ; RV64XTHEADSYNC: .attribute 5, "rv64i2p1_xtheadsync1p0" ; RV64XTHEADVDOT: .attribute 5, "rv64i2p1_f2p2_d2p2_v1p0_zicsr2p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xtheadvdot1p0" ; RV64XANDESPERF: .attribute 5, "rv64i2p1_xandesperf5p0" +; RV64XANDESVBFHCVT: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvl32b1p0_xandesvbfhcvt5p0" ; RV64XANDESVDOT: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0" ; RV64XANDESVPACKFPH: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfhmin1p0_zvl32b1p0_xandesvpackfph5p0" ; RV64ZTSO: .attribute 5, "rv64i2p1_ztso1p0" @@ -683,7 +687,7 @@ ; RV64SSCTR: .attribute 5, 
"rv64i2p1_sscsrind1p0_ssctr1p0" ; RV64SDEXT: .attribute 5, "rv64i2p1_sdext1p0" ; RV64SDTRIG: .attribute 5, "rv64i2p1_sdtrig1p0" -; RV64XQCCMP: .attribute 5, "rv64i2p1_zca1p0_xqccmp0p1" +; RV64XQCCMP: .attribute 5, "rv64i2p1_zca1p0_xqccmp0p3" ; RVI20U32: .attribute 5, "rv32i2p1" ; RVI20U64: .attribute 5, "rv64i2p1" diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index fab2e94959301..8b931f70aa5cc 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -174,6 +174,7 @@ ; CHECK-NEXT: ventana-veyron - Ventana Veyron-Series processors. ; CHECK-NEXT: vxrm-pipeline-flush - VXRM writes causes pipeline flush. ; CHECK-NEXT: xandesperf - 'XAndesPerf' (Andes Performance Extension). +; CHECK-NEXT: xandesvbfhcvt - 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension). ; CHECK-NEXT: xandesvdot - 'XAndesVDot' (Andes Vector Dot Product Extension). ; CHECK-NEXT: xandesvpackfph - 'XAndesVPackFPH' (Andes Vector Packed FP16 Extension). ; CHECK-NEXT: xcvalu - 'XCValu' (CORE-V ALU Operations). 
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll index 83637e4a71d45..7c9a283dd54bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -290,8 +290,7 @@ define @extract_nxv32i8_nxv2i8_6( %vec) { ; CHECK-LABEL: extract_nxv32i8_nxv2i8_6: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 @@ -314,8 +313,7 @@ define @extract_nxv32i8_nxv2i8_22( %vec) { ; CHECK-LABEL: extract_nxv32i8_nxv2i8_22: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v10, a0 @@ -341,9 +339,9 @@ define @extract_nxv4i8_nxv1i8_3( %vec) { ; CHECK-LABEL: extract_nxv4i8_nxv1i8_3: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: ret @@ -490,8 +488,6 @@ define @extract_nxv6f16_nxv12f16_6( %in) ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v13, v10, a0 ; CHECK-NEXT: vslidedown.vx v12, v9, a0 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v12, v10, a0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret @@ -545,8 +541,6 @@ define @extract_nxv6bf16_nxv12bf16_6( @insert_nxv1i8_nxv4i8_3( %vec, @llvm.vector.insert.nxv1i8.nxv4i8( %vec, %subvec, i64 3) ret %v @@ -246,8 +245,7 @@ define @insert_nxv16i32_nxv1i32_1( %vec, ; CHECK: # %bb.0: 
; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v16, a0 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv1i32.nxv16i32( %vec, %subvec, i64 1) @@ -282,8 +280,8 @@ define @insert_nxv16i8_nxv1i8_1( %vec, @insert_nxv16i8_nxv1i8_3( %vec, @insert_nxv32f16_nxv2f16_2( %vec ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v16, a0 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv2f16.nxv32f16( %vec, %subvec, i64 2) @@ -376,8 +373,7 @@ define @insert_nxv32f16_nxv2f16_26( %ve ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v14, v16, a0 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv2f16.nxv32f16( %vec, %subvec, i64 26) @@ -422,8 +418,8 @@ define @insert_nxv32i1_nxv8i1_8( %v, @insert_nxv32bf16_nxv2bf16_2( @llvm.vector.insert.nxv2bf16.nxv32bf16( %vec, %subvec, i64 2) @@ -583,8 +578,7 @@ define @insert_nxv32bf16_nxv2bf16_26( @llvm.vector.insert.nxv2bf16.nxv32bf16( %vec, %subvec, i64 26) diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll index e9e1303d10768..f847ccafefdaf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll @@ -8,9 +8,9 @@ define @load_nxv3i8(ptr %ptr) { ; CHECK-LABEL: load_nxv3i8: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a1, a1, 3 -; CHECK-NEXT: slli a2, a1, 1 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: srli a2, a1, 3 +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add 
a1, a1, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: ret @@ -22,9 +22,9 @@ define @load_nxv5f16(ptr %ptr) { ; CHECK-LABEL: load_nxv5f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a1, a1, 3 -; CHECK-NEXT: slli a2, a1, 2 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: srli a2, a1, 3 +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll index 77438ee53b634..03b84ec177ee9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll @@ -8,9 +8,9 @@ define void @store_nxv3i8( %val, ptr %ptr) { ; CHECK-LABEL: store_nxv3i8: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a1, a1, 3 -; CHECK-NEXT: slli a2, a1, 1 -; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: srli a2, a1, 3 +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 28b27bb75f210..9972df97ad9f4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -1371,6 +1371,8 @@ define @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64bf16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vf_nx16f64( %va) { ; RV32-NEXT: vmfeq.vf v24, v16, fa5 ; RV32-NEXT: vmfeq.vf v0, v8, fa5 ; RV32-NEXT: srli a0, a0, 3 -; RV32-NEXT: add a1, a0, 
a0 -; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; RV32-NEXT: vslideup.vx v0, v24, a0 ; RV32-NEXT: ret ; @@ -4293,8 +4292,7 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; RV64-NEXT: vmfeq.vf v24, v16, fa5 ; RV64-NEXT: vmfeq.vf v0, v8, fa5 ; RV64-NEXT: srli a0, a0, 3 -; RV64-NEXT: add a1, a0, a0 -; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; RV64-NEXT: vslideup.vx v0, v24, a0 ; RV64-NEXT: ret ; @@ -4306,8 +4304,7 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; ZVFHMIN32-NEXT: vmfeq.vf v24, v16, fa5 ; ZVFHMIN32-NEXT: vmfeq.vf v0, v8, fa5 ; ZVFHMIN32-NEXT: srli a0, a0, 3 -; ZVFHMIN32-NEXT: add a1, a0, a0 -; ZVFHMIN32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; ZVFHMIN32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; ZVFHMIN32-NEXT: vslideup.vx v0, v24, a0 ; ZVFHMIN32-NEXT: ret ; @@ -4319,8 +4316,7 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; ZVFHMIN64-NEXT: vmfeq.vf v24, v16, fa5 ; ZVFHMIN64-NEXT: vmfeq.vf v0, v8, fa5 ; ZVFHMIN64-NEXT: srli a0, a0, 3 -; ZVFHMIN64-NEXT: add a1, a0, a0 -; ZVFHMIN64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; ZVFHMIN64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; ZVFHMIN64-NEXT: vslideup.vx v0, v24, a0 ; ZVFHMIN64-NEXT: ret %vc = fcmp oeq %va, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll index ef560a7631dee..13c63d9c80a9a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll @@ -2246,8 +2246,7 @@ define @icmp_eq_vv_nxv32i32( %va, @icmp_eq_vx_nxv32i32( %va, i32 %b, ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t -; CHECK-NEXT: add a0, a2, a2 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v16, v25, a2 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2316,8 +2314,7 @@ define 
@icmp_eq_vx_swap_nxv32i32( %va, i32 ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t -; CHECK-NEXT: add a0, a2, a2 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v16, v25, a2 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll index bd3c29b0c6efc..a85b471530cc9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll @@ -3001,9 +3001,8 @@ define @icmp_eq_vi_nx16i64( %va) { ; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma ; CHECK-NEXT: vmseq.vi v24, v16, 0 ; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: add a1, a0, a0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma ; CHECK-NEXT: vslideup.vx v0, v24, a0 ; CHECK-NEXT: ret %vc = icmp eq %va, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll index d4e2c08d70d3d..95c1292e41927 100644 --- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll @@ -637,21 +637,21 @@ define @mul_bigimm_stepvector_nxv16i64() { ; RV32-NEXT: lui a1, 797989 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: lui a3, 11557 -; RV32-NEXT: lui a4, 92455 ; RV32-NEXT: addi a1, a1, -683 -; RV32-NEXT: addi a3, a3, -683 +; RV32-NEXT: srli a4, a2, 2 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: srli a0, a2, 3 -; RV32-NEXT: addi a1, a4, -1368 -; RV32-NEXT: mul a2, a2, a3 -; RV32-NEXT: mulhu a1, a0, a1 -; RV32-NEXT: slli a3, a0, 1 -; RV32-NEXT: slli a0, a0, 6 -; RV32-NEXT: sub a0, a0, a3 +; RV32-NEXT: slli a0, a2, 3 +; RV32-NEXT: sub a0, a0, a4 +; RV32-NEXT: lui a1, 92455 +; RV32-NEXT: addi a3, a3, -683 +; RV32-NEXT: mul a3, a2, a3 +; RV32-NEXT: srli a2, a2, 3 +; RV32-NEXT: 
addi a1, a1, -1368 +; RV32-NEXT: mulhu a1, a2, a1 ; RV32-NEXT: add a0, a1, a0 ; RV32-NEXT: addi a1, sp, 8 -; RV32-NEXT: sw a2, 0(sp) +; RV32-NEXT: sw a3, 0(sp) ; RV32-NEXT: sw a0, 4(sp) ; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v8, (a1), zero diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll index c9f9a79733003..790cd56ee952c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll +++ b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll @@ -48,10 +48,10 @@ define internal void @SubRegLivenessUndefInPhi(i64 %cond) { ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vid.v v9 ; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: vadd.vi v10, v9, 1 ; CHECK-NEXT: vadd.vi v11, v9, 3 -; CHECK-NEXT: add a1, a0, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v9, a0 ; CHECK-NEXT: vslideup.vx v12, v10, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll index aef46e1f5cf1b..66e114c938c06 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll @@ -2240,20 +2240,19 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-RV32-LABEL: vand_vx_loop_hoisted_not: ; CHECK-RV32: # %bb.0: # %entry ; CHECK-RV32-NEXT: csrr a4, vlenb -; CHECK-RV32-NEXT: srli a3, a4, 3 -; CHECK-RV32-NEXT: li a2, 64 +; CHECK-RV32-NEXT: srli a2, a4, 3 +; CHECK-RV32-NEXT: li a3, 64 ; CHECK-RV32-NEXT: not a1, a1 -; CHECK-RV32-NEXT: bgeu a2, a3, .LBB98_2 +; CHECK-RV32-NEXT: bgeu a3, a2, .LBB98_2 ; CHECK-RV32-NEXT: # %bb.1: ; CHECK-RV32-NEXT: li a3, 0 ; CHECK-RV32-NEXT: li a2, 0 ; CHECK-RV32-NEXT: j .LBB98_5 ; CHECK-RV32-NEXT: .LBB98_2: # %vector.ph ; CHECK-RV32-NEXT: li a2, 0 -; CHECK-RV32-NEXT: slli a3, a3, 2 -; CHECK-RV32-NEXT: neg a3, a3 -; CHECK-RV32-NEXT: andi 
a3, a3, 256 ; CHECK-RV32-NEXT: srli a4, a4, 1 +; CHECK-RV32-NEXT: neg a3, a4 +; CHECK-RV32-NEXT: andi a3, a3, 256 ; CHECK-RV32-NEXT: li a6, 0 ; CHECK-RV32-NEXT: li a5, 0 ; CHECK-RV32-NEXT: vsetvli a7, zero, e32, m2, ta, ma @@ -2300,10 +2299,9 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-RV64-NEXT: li a2, 0 ; CHECK-RV64-NEXT: j .LBB98_5 ; CHECK-RV64-NEXT: .LBB98_2: # %vector.ph -; CHECK-RV64-NEXT: slli a2, a2, 2 -; CHECK-RV64-NEXT: negw a2, a2 -; CHECK-RV64-NEXT: andi a2, a2, 256 ; CHECK-RV64-NEXT: srli a3, a4, 1 +; CHECK-RV64-NEXT: negw a2, a3 +; CHECK-RV64-NEXT: andi a2, a2, 256 ; CHECK-RV64-NEXT: slli a4, a4, 1 ; CHECK-RV64-NEXT: mv a5, a0 ; CHECK-RV64-NEXT: mv a6, a2 @@ -2335,19 +2333,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-NOZBB32-LABEL: vand_vx_loop_hoisted_not: ; CHECK-ZVKB-NOZBB32: # %bb.0: # %entry ; CHECK-ZVKB-NOZBB32-NEXT: csrr a4, vlenb -; CHECK-ZVKB-NOZBB32-NEXT: srli a3, a4, 3 -; CHECK-ZVKB-NOZBB32-NEXT: li a2, 64 -; CHECK-ZVKB-NOZBB32-NEXT: bgeu a2, a3, .LBB98_2 +; CHECK-ZVKB-NOZBB32-NEXT: srli a2, a4, 3 +; CHECK-ZVKB-NOZBB32-NEXT: li a3, 64 +; CHECK-ZVKB-NOZBB32-NEXT: bgeu a3, a2, .LBB98_2 ; CHECK-ZVKB-NOZBB32-NEXT: # %bb.1: ; CHECK-ZVKB-NOZBB32-NEXT: li a3, 0 ; CHECK-ZVKB-NOZBB32-NEXT: li a2, 0 ; CHECK-ZVKB-NOZBB32-NEXT: j .LBB98_5 ; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_2: # %vector.ph ; CHECK-ZVKB-NOZBB32-NEXT: li a2, 0 -; CHECK-ZVKB-NOZBB32-NEXT: slli a3, a3, 2 -; CHECK-ZVKB-NOZBB32-NEXT: neg a3, a3 -; CHECK-ZVKB-NOZBB32-NEXT: andi a3, a3, 256 ; CHECK-ZVKB-NOZBB32-NEXT: srli a4, a4, 1 +; CHECK-ZVKB-NOZBB32-NEXT: neg a3, a4 +; CHECK-ZVKB-NOZBB32-NEXT: andi a3, a3, 256 ; CHECK-ZVKB-NOZBB32-NEXT: li a6, 0 ; CHECK-ZVKB-NOZBB32-NEXT: li a5, 0 ; CHECK-ZVKB-NOZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma @@ -2395,10 +2392,9 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-NOZBB64-NEXT: li a2, 0 ; CHECK-ZVKB-NOZBB64-NEXT: j 
.LBB98_5 ; CHECK-ZVKB-NOZBB64-NEXT: .LBB98_2: # %vector.ph -; CHECK-ZVKB-NOZBB64-NEXT: slli a2, a2, 2 -; CHECK-ZVKB-NOZBB64-NEXT: negw a2, a2 -; CHECK-ZVKB-NOZBB64-NEXT: andi a2, a2, 256 ; CHECK-ZVKB-NOZBB64-NEXT: srli a3, a4, 1 +; CHECK-ZVKB-NOZBB64-NEXT: negw a2, a3 +; CHECK-ZVKB-NOZBB64-NEXT: andi a2, a2, 256 ; CHECK-ZVKB-NOZBB64-NEXT: slli a4, a4, 1 ; CHECK-ZVKB-NOZBB64-NEXT: mv a5, a0 ; CHECK-ZVKB-NOZBB64-NEXT: mv a6, a2 @@ -2431,19 +2427,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-ZBB32-LABEL: vand_vx_loop_hoisted_not: ; CHECK-ZVKB-ZBB32: # %bb.0: # %entry ; CHECK-ZVKB-ZBB32-NEXT: csrr a4, vlenb -; CHECK-ZVKB-ZBB32-NEXT: srli a3, a4, 3 -; CHECK-ZVKB-ZBB32-NEXT: li a2, 64 -; CHECK-ZVKB-ZBB32-NEXT: bgeu a2, a3, .LBB98_2 +; CHECK-ZVKB-ZBB32-NEXT: srli a2, a4, 3 +; CHECK-ZVKB-ZBB32-NEXT: li a3, 64 +; CHECK-ZVKB-ZBB32-NEXT: bgeu a3, a2, .LBB98_2 ; CHECK-ZVKB-ZBB32-NEXT: # %bb.1: ; CHECK-ZVKB-ZBB32-NEXT: li a3, 0 ; CHECK-ZVKB-ZBB32-NEXT: li a2, 0 ; CHECK-ZVKB-ZBB32-NEXT: j .LBB98_5 ; CHECK-ZVKB-ZBB32-NEXT: .LBB98_2: # %vector.ph ; CHECK-ZVKB-ZBB32-NEXT: li a2, 0 -; CHECK-ZVKB-ZBB32-NEXT: slli a3, a3, 2 -; CHECK-ZVKB-ZBB32-NEXT: neg a3, a3 -; CHECK-ZVKB-ZBB32-NEXT: andi a3, a3, 256 ; CHECK-ZVKB-ZBB32-NEXT: srli a4, a4, 1 +; CHECK-ZVKB-ZBB32-NEXT: neg a3, a4 +; CHECK-ZVKB-ZBB32-NEXT: andi a3, a3, 256 ; CHECK-ZVKB-ZBB32-NEXT: li a6, 0 ; CHECK-ZVKB-ZBB32-NEXT: li a5, 0 ; CHECK-ZVKB-ZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma @@ -2489,10 +2484,9 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-ZBB64-NEXT: li a2, 0 ; CHECK-ZVKB-ZBB64-NEXT: j .LBB98_5 ; CHECK-ZVKB-ZBB64-NEXT: .LBB98_2: # %vector.ph -; CHECK-ZVKB-ZBB64-NEXT: slli a2, a2, 2 -; CHECK-ZVKB-ZBB64-NEXT: negw a2, a2 -; CHECK-ZVKB-ZBB64-NEXT: andi a2, a2, 256 ; CHECK-ZVKB-ZBB64-NEXT: srli a3, a4, 1 +; CHECK-ZVKB-ZBB64-NEXT: negw a2, a3 +; CHECK-ZVKB-ZBB64-NEXT: andi a2, a2, 256 ; CHECK-ZVKB-ZBB64-NEXT: slli a4, a4, 1 ; 
CHECK-ZVKB-ZBB64-NEXT: mv a5, a0 ; CHECK-ZVKB-ZBB64-NEXT: mv a6, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index ca7f2563e4fc9..4753ab915bdf3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -191,8 +191,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x ; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 4 ; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v12, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vmv1r.v v9, v10 @@ -222,8 +221,7 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave4_v2i32_ ; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vx v12, v10, a0 ; CHECK-NEXT: vslideup.vx v8, v9, a0 ; CHECK-NEXT: addi a0, sp, 16 @@ -254,15 +252,13 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle ; CHECK-NEXT: vslidedown.vi v14, v8, 2 ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 8 -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a2, a1, a1 -; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v13, v12, a1 -; CHECK-NEXT: vslideup.vx v8, v14, a1 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v13, a0 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v13, v12, a0 +; CHECK-NEXT: vslideup.vx 
v8, v14, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v13, a1 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vmv1r.v v9, v10 ; CHECK-NEXT: vs2r.v v8, (a0) @@ -292,16 +288,14 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vecto ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 10 ; CHECK-NEXT: vslidedown.vi v12, v8, 8 -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a2, a1, a1 -; CHECK-NEXT: add a3, a0, a0 -; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v15, v14, a1 -; CHECK-NEXT: vslideup.vx v8, v16, a1 -; CHECK-NEXT: vslideup.vx v12, v10, a1 -; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v15, a0 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v15, v14, a0 +; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v15, a1 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vmv1r.v v9, v12 ; CHECK-NEXT: vs2r.v v8, (a0) @@ -330,24 +324,19 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v ; CHECK-NEXT: vslidedown.vi v12, v8, 2 ; CHECK-NEXT: vslidedown.vi v13, v8, 4 ; CHECK-NEXT: vslidedown.vi v14, v8, 6 -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: srli a2, a0, 2 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a2, a0, 3 ; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: add a3, a1, a1 -; CHECK-NEXT: add a4, a2, a1 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v10, v9, a2 +; CHECK-NEXT: add a3, a1, a2 +; CHECK-NEXT: vslideup.vx v8, v12, a2 ; CHECK-NEXT: vsetvli zero, a3, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vx v10, v9, a1 -; CHECK-NEXT: vslideup.vx v8, v12, a1 -; CHECK-NEXT: slli a3, a1, 1 -; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma -; 
CHECK-NEXT: vslideup.vx v10, v11, a2 -; CHECK-NEXT: vslideup.vx v8, v13, a2 -; CHECK-NEXT: add a2, a0, a0 -; CHECK-NEXT: add a3, a3, a1 -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v10, v11, a1 +; CHECK-NEXT: vslideup.vx v8, v13, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v8, v14, a3 -; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v10, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs1r.v v8, (a0) @@ -376,25 +365,20 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 ; CHECK-NEXT: vslidedown.vi v13, v8, 2 ; CHECK-NEXT: vslidedown.vi v14, v8, 4 ; CHECK-NEXT: vslidedown.vi v15, v8, 6 -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: srli a2, a0, 2 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a2, a0, 3 ; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: add a3, a1, a1 -; CHECK-NEXT: add a4, a2, a1 -; CHECK-NEXT: slli a5, a1, 1 -; CHECK-NEXT: add a6, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v10, v9, a2 +; CHECK-NEXT: add a3, a1, a2 +; CHECK-NEXT: vslideup.vx v8, v13, a2 ; CHECK-NEXT: vsetvli zero, a3, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vx v10, v9, a1 -; CHECK-NEXT: add a5, a5, a1 -; CHECK-NEXT: vslideup.vx v8, v13, a1 -; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma -; CHECK-NEXT: vslideup.vx v10, v11, a2 -; CHECK-NEXT: add a1, a5, a1 -; CHECK-NEXT: vslideup.vx v8, v14, a2 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v10, v12, a5 -; CHECK-NEXT: vslideup.vx v8, v15, a5 -; CHECK-NEXT: vsetvli zero, a6, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v10, v11, a1 +; CHECK-NEXT: vslideup.vx v8, v14, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v10, v12, a3 +; CHECK-NEXT: vslideup.vx v8, v15, a3 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, 
v10, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs1r.v v8, (a0) @@ -555,8 +539,7 @@ define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v6f32_v2f32 ; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 4 ; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v12, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vmv1r.v v9, v10 @@ -590,8 +573,7 @@ define {<2 x float>, <2 x float>, <2 x float>, <2 x float>} @vector_deinterleave ; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vx v12, v10, a0 ; CHECK-NEXT: vslideup.vx v8, v9, a0 ; CHECK-NEXT: addi a0, sp, 16 @@ -626,15 +608,13 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_dein ; CHECK-NEXT: vslidedown.vi v14, v8, 2 ; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 8 -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a2, a1, a1 -; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v13, v12, a1 -; CHECK-NEXT: vslideup.vx v8, v14, a1 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v13, a0 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v13, v12, a0 +; CHECK-NEXT: vslideup.vx v8, v14, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v13, a1 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vmv1r.v v9, v10 ; CHECK-NEXT: vs2r.v v8, (a0) @@ -668,16 +648,14 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} ; 
CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 10 ; CHECK-NEXT: vslidedown.vi v12, v8, 8 -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a2, a1, a1 -; CHECK-NEXT: add a3, a0, a0 -; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v15, v14, a1 -; CHECK-NEXT: vslideup.vx v8, v16, a1 -; CHECK-NEXT: vslideup.vx v12, v10, a1 -; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v15, a0 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v15, v14, a0 +; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v15, a1 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vmv1r.v v9, v12 ; CHECK-NEXT: vs2r.v v8, (a0) @@ -711,21 +689,18 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, ; CHECK-NEXT: vmv1r.v v10, v8 ; CHECK-NEXT: vslidedown.vi v13, v8, 5 ; CHECK-NEXT: vslidedown.vi v14, v8, 6 -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a2, a1, a1 -; CHECK-NEXT: add a3, a0, a0 -; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v11, v9, a1 -; CHECK-NEXT: vslideup.vx v10, v12, a1 -; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v10, v11, a0 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v11, v9, a0 +; CHECK-NEXT: vslideup.vx v10, v12, a0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v10, v11, a1 ; CHECK-NEXT: vslidedown.vi v11, v8, 4 -; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v11, v13, a1 -; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v11, v14, 
a0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v11, v13, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v11, v14, a1 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs2r.v v10, (a0) ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma @@ -755,25 +730,22 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, ; CHECK-NEXT: vslidedown.vi v10, v8, 7 ; CHECK-NEXT: vslidedown.vi v11, v8, 6 ; CHECK-NEXT: vslidedown.vi v12, v8, 5 -; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: vslidedown.vi v9, v8, 4 -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: add a2, a1, a1 -; CHECK-NEXT: add a3, a0, a0 -; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v11, v10, a1 -; CHECK-NEXT: vslideup.vx v9, v12, a1 -; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v9, v11, a0 -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v11, v10, a0 +; CHECK-NEXT: vslideup.vx v9, v12, a0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v11, a1 ; CHECK-NEXT: vslidedown.vi v10, v8, 3 ; CHECK-NEXT: vslidedown.vi v11, v8, 2 ; CHECK-NEXT: vslidedown.vi v12, v8, 1 -; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v11, v10, a1 -; CHECK-NEXT: vslideup.vx v8, v12, a1 -; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v11, a0 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v11, v10, a0 +; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v11, a1 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs2r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index 
6a08f5a28a295..6144f916ea52b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -477,27 +477,26 @@ define {, , , , , , , , , , , , , , , , , , , , , , , , %a, @vector_interleave_nxv32i1_nxv16i1( ; V-NEXT: vmv1r.v v0, v8 ; V-NEXT: vmv.v.i v10, 0 ; V-NEXT: li a0, -1 -; V-NEXT: csrr a1, vlenb ; V-NEXT: vmerge.vim v12, v10, 1, v0 ; V-NEXT: vmv1r.v v0, v9 ; V-NEXT: vmerge.vim v14, v10, 1, v0 -; V-NEXT: srli a1, a1, 2 ; V-NEXT: vwaddu.vv v8, v14, v12 ; V-NEXT: vwmaccu.vx v8, a0, v12 +; V-NEXT: csrr a0, vlenb ; V-NEXT: vmsne.vi v12, v10, 0 ; V-NEXT: vmsne.vi v0, v8, 0 -; V-NEXT: add a0, a1, a1 -; V-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; V-NEXT: vslideup.vx v0, v12, a1 +; V-NEXT: srli a0, a0, 2 +; V-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; V-NEXT: vslideup.vx v0, v12, a0 ; V-NEXT: ret ; ; ZVBB-LABEL: vector_interleave_nxv32i1_nxv16i1: @@ -38,17 +37,16 @@ define @vector_interleave_nxv32i1_nxv16i1( ; ZVBB-NEXT: vmv1r.v v0, v8 ; ZVBB-NEXT: vmv.v.i v10, 0 ; ZVBB-NEXT: li a0, 1 -; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: vmerge.vim v10, v10, 1, v0 -; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: vwsll.vi v12, v10, 8 ; ZVBB-NEXT: vmv1r.v v0, v9 ; ZVBB-NEXT: vwaddu.wx v12, v12, a0, v0.t +; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: vmsne.vi v8, v14, 0 ; ZVBB-NEXT: vmsne.vi v0, v12, 0 -; ZVBB-NEXT: add a0, a1, a1 -; ZVBB-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v8, a1 +; ZVBB-NEXT: srli a0, a0, 2 +; ZVBB-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v8, a0 ; ZVBB-NEXT: ret ; ; ZIP-LABEL: vector_interleave_nxv32i1_nxv16i1: @@ -61,13 +59,12 @@ define @vector_interleave_nxv32i1_nxv16i1( ; ZIP-NEXT: vmerge.vim v12, v10, 1, v0 ; ZIP-NEXT: vmv1r.v v0, v9 ; ZIP-NEXT: vmerge.vim v8, v10, 1, v0 -; ZIP-NEXT: srli a0, a0, 2 ; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12 ; ZIP-NEXT: ri.vzip2a.vv v14, v8, v12 ; ZIP-NEXT: vmsne.vi v8, v10, 0 ; ZIP-NEXT: vmsne.vi v0, v14, 0 -; ZIP-NEXT: 
add a1, a0, a0 -; ZIP-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; ZIP-NEXT: srli a0, a0, 2 +; ZIP-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; ZIP-NEXT: vslideup.vx v0, v8, a0 ; ZIP-NEXT: ret %res = call @llvm.vector.interleave2.nxv32i1( %a, %b) @@ -508,19 +505,17 @@ define @vector_interleave_nxv48i1_nxv16i1( ; CHECK-NEXT: add a2, a3, a2 ; CHECK-NEXT: vsseg3e8.v v14, (a0) ; CHECK-NEXT: vl2r.v v8, (a2) -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: srli a2, a1, 1 ; CHECK-NEXT: vl2r.v v10, (a3) ; CHECK-NEXT: vl2r.v v12, (a0) -; CHECK-NEXT: add a0, a2, a2 +; CHECK-NEXT: srli a1, a1, 2 ; CHECK-NEXT: vmsne.vi v14, v8, 0 ; CHECK-NEXT: vmsne.vi v8, v10, 0 ; CHECK-NEXT: vmsne.vi v0, v12, 0 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v0, v8, a2 -; CHECK-NEXT: add a0, a1, a1 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v0, v14, a1 +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v0, v8, a1 +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v0, v14, a2 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 6 ; CHECK-NEXT: mul a0, a0, a1 @@ -551,19 +546,17 @@ define @vector_interleave_nxv48i1_nxv16i1( ; ZVBB-NEXT: add a2, a3, a2 ; ZVBB-NEXT: vsseg3e8.v v14, (a0) ; ZVBB-NEXT: vl2r.v v8, (a2) -; ZVBB-NEXT: srli a2, a1, 2 -; ZVBB-NEXT: srli a1, a1, 1 +; ZVBB-NEXT: srli a2, a1, 1 ; ZVBB-NEXT: vl2r.v v10, (a3) ; ZVBB-NEXT: vl2r.v v12, (a0) -; ZVBB-NEXT: add a0, a2, a2 +; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: vmsne.vi v14, v8, 0 ; ZVBB-NEXT: vmsne.vi v8, v10, 0 ; ZVBB-NEXT: vmsne.vi v0, v12, 0 -; ZVBB-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v8, a2 -; ZVBB-NEXT: add a0, a1, a1 -; ZVBB-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v14, a1 +; ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v8, a1 +; ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v14, 
a2 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: li a1, 6 ; ZVBB-NEXT: mul a0, a0, a1 @@ -812,22 +805,20 @@ define @vector_interleave_nxv64i1_nxv16i1( ; CHECK-NEXT: add a2, a4, a2 ; CHECK-NEXT: vsseg4e8.v v14, (a0) ; CHECK-NEXT: vl2r.v v8, (a2) -; CHECK-NEXT: srli a2, a1, 2 -; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: srli a1, a1, 2 ; CHECK-NEXT: vl2r.v v10, (a4) -; CHECK-NEXT: add a4, a2, a2 ; CHECK-NEXT: vl2r.v v12, (a3) ; CHECK-NEXT: vl2r.v v14, (a0) ; CHECK-NEXT: vmsne.vi v16, v8, 0 ; CHECK-NEXT: vmsne.vi v8, v10, 0 ; CHECK-NEXT: vmsne.vi v9, v12, 0 ; CHECK-NEXT: vmsne.vi v0, v14, 0 -; CHECK-NEXT: vsetvli zero, a4, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v8, v16, a2 -; CHECK-NEXT: vslideup.vx v0, v9, a2 -; CHECK-NEXT: add a0, a1, a1 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v0, v8, a1 +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: vslideup.vx v0, v9, a1 +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v0, v8, a2 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -859,22 +850,20 @@ define @vector_interleave_nxv64i1_nxv16i1( ; ZVBB-NEXT: add a2, a4, a2 ; ZVBB-NEXT: vsseg4e8.v v14, (a0) ; ZVBB-NEXT: vl2r.v v8, (a2) -; ZVBB-NEXT: srli a2, a1, 2 -; ZVBB-NEXT: srli a1, a1, 1 +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: vl2r.v v10, (a4) -; ZVBB-NEXT: add a4, a2, a2 ; ZVBB-NEXT: vl2r.v v12, (a3) ; ZVBB-NEXT: vl2r.v v14, (a0) ; ZVBB-NEXT: vmsne.vi v16, v8, 0 ; ZVBB-NEXT: vmsne.vi v8, v10, 0 ; ZVBB-NEXT: vmsne.vi v9, v12, 0 ; ZVBB-NEXT: vmsne.vi v0, v14, 0 -; ZVBB-NEXT: vsetvli zero, a4, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vx v8, v16, a2 -; ZVBB-NEXT: vslideup.vx v0, v9, a2 -; ZVBB-NEXT: add a0, a1, a1 -; ZVBB-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v8, a1 +; ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v8, 
v16, a1 +; ZVBB-NEXT: vslideup.vx v0, v9, a1 +; ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v8, a2 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 3 ; ZVBB-NEXT: add sp, sp, a0 @@ -1114,7 +1103,7 @@ define @vector_interleave_nxv80i1_nxv16i1( ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v18, v12, 1, v0 ; CHECK-NEXT: add a2, a4, a1 -; CHECK-NEXT: srli a3, a1, 2 +; CHECK-NEXT: srli a3, a1, 1 ; CHECK-NEXT: vmv2r.v v20, v14 ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 @@ -1144,11 +1133,9 @@ define @vector_interleave_nxv80i1_nxv16i1( ; CHECK-NEXT: add a5, a4, a1 ; CHECK-NEXT: vl1r.v v16, (a5) ; CHECK-NEXT: add a5, a5, a1 -; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: srli a1, a1, 2 ; CHECK-NEXT: vl1r.v v11, (a2) -; CHECK-NEXT: add a2, a3, a3 ; CHECK-NEXT: vl1r.v v15, (a4) -; CHECK-NEXT: add a4, a1, a1 ; CHECK-NEXT: vl1r.v v13, (a0) ; CHECK-NEXT: vl1r.v v17, (a5) ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -1156,11 +1143,11 @@ define @vector_interleave_nxv80i1_nxv16i1( ; CHECK-NEXT: vmsne.vi v0, v10, 0 ; CHECK-NEXT: vmsne.vi v8, v14, 0 ; CHECK-NEXT: vmsne.vi v9, v12, 0 -; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v0, v18, a3 -; CHECK-NEXT: vslideup.vx v9, v8, a3 -; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v0, v9, a1 +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v0, v18, a1 +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v0, v9, a3 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-NEXT: vmsne.vi v8, v16, 0 ; CHECK-NEXT: csrr a0, vlenb @@ -1190,7 +1177,7 @@ define @vector_interleave_nxv80i1_nxv16i1( ; ZVBB-NEXT: vmv1r.v v0, v8 ; ZVBB-NEXT: vmerge.vim v18, v12, 1, v0 ; ZVBB-NEXT: add a2, a4, a1 -; ZVBB-NEXT: srli a3, a1, 2 +; ZVBB-NEXT: srli a3, a1, 1 ; ZVBB-NEXT: vmv2r.v v20, v14 ; ZVBB-NEXT: vmv1r.v v0, v9 ; ZVBB-NEXT: vmerge.vim v16, v12, 
1, v0 @@ -1220,11 +1207,9 @@ define @vector_interleave_nxv80i1_nxv16i1( ; ZVBB-NEXT: add a5, a4, a1 ; ZVBB-NEXT: vl1r.v v16, (a5) ; ZVBB-NEXT: add a5, a5, a1 -; ZVBB-NEXT: srli a1, a1, 1 +; ZVBB-NEXT: srli a1, a1, 2 ; ZVBB-NEXT: vl1r.v v11, (a2) -; ZVBB-NEXT: add a2, a3, a3 ; ZVBB-NEXT: vl1r.v v15, (a4) -; ZVBB-NEXT: add a4, a1, a1 ; ZVBB-NEXT: vl1r.v v13, (a0) ; ZVBB-NEXT: vl1r.v v17, (a5) ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -1232,11 +1217,11 @@ define @vector_interleave_nxv80i1_nxv16i1( ; ZVBB-NEXT: vmsne.vi v0, v10, 0 ; ZVBB-NEXT: vmsne.vi v8, v14, 0 ; ZVBB-NEXT: vmsne.vi v9, v12, 0 -; ZVBB-NEXT: vsetvli zero, a2, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v18, a3 -; ZVBB-NEXT: vslideup.vx v9, v8, a3 -; ZVBB-NEXT: vsetvli zero, a4, e8, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v9, a1 +; ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v18, a1 +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v9, a3 ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; ZVBB-NEXT: vmsne.vi v8, v16, 0 ; ZVBB-NEXT: csrr a0, vlenb @@ -2340,47 +2325,45 @@ define @vector_interleave_nxv96i1_nxv16i1( ; CHECK-NEXT: vmv1r.v v17, v9 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmerge.vim v24, v20, 1, v0 -; CHECK-NEXT: addi a5, sp, 16 +; CHECK-NEXT: addi a4, sp, 16 ; CHECK-NEXT: vmv1r.v v18, v25 ; CHECK-NEXT: vmv1r.v v0, v11 ; CHECK-NEXT: vmerge.vim v26, v20, 1, v0 -; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: vmv1r.v v19, v27 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vim v10, v20, 1, v0 -; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: vmv1r.v v20, v11 -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma ; CHECK-NEXT: vsseg6e8.v v15, (a0) ; CHECK-NEXT: vmv1r.v v15, v22 -; CHECK-NEXT: add a4, a5, a2 +; CHECK-NEXT: add a5, a4, a1 ; CHECK-NEXT: vmv1r.v v16, v8 -; CHECK-NEXT: srli a1, a2, 2 +; 
CHECK-NEXT: srli a3, a1, 1 ; CHECK-NEXT: vmv1r.v v17, v24 -; CHECK-NEXT: add a6, a4, a2 +; CHECK-NEXT: add a6, a5, a1 ; CHECK-NEXT: vmv1r.v v18, v26 -; CHECK-NEXT: add a7, a3, a2 +; CHECK-NEXT: add a7, a2, a1 ; CHECK-NEXT: vmv1r.v v19, v10 -; CHECK-NEXT: vsseg6e8.v v14, (a5) +; CHECK-NEXT: vsseg6e8.v v14, (a4) ; CHECK-NEXT: vl1r.v v8, (a0) -; CHECK-NEXT: add a0, a6, a2 +; CHECK-NEXT: add a0, a6, a1 ; CHECK-NEXT: vl1r.v v10, (a6) -; CHECK-NEXT: add a6, a7, a2 -; CHECK-NEXT: vl1r.v v12, (a5) -; CHECK-NEXT: add a5, a0, a2 +; CHECK-NEXT: add a6, a7, a1 +; CHECK-NEXT: vl1r.v v12, (a4) +; CHECK-NEXT: add a4, a0, a1 ; CHECK-NEXT: vl1r.v v14, (a7) -; CHECK-NEXT: add a7, a6, a2 -; CHECK-NEXT: vl1r.v v16, (a5) -; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: add a7, a6, a1 +; CHECK-NEXT: vl1r.v v16, (a4) +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vl1r.v v18, (a7) -; CHECK-NEXT: add a7, a7, a2 -; CHECK-NEXT: srli a2, a2, 1 -; CHECK-NEXT: vl1r.v v9, (a3) -; CHECK-NEXT: add a3, a1, a1 -; CHECK-NEXT: vl1r.v v17, (a5) -; CHECK-NEXT: add a5, a2, a2 +; CHECK-NEXT: add a7, a7, a1 +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: vl1r.v v9, (a2) +; CHECK-NEXT: vl1r.v v17, (a4) ; CHECK-NEXT: vl1r.v v11, (a0) -; CHECK-NEXT: vl1r.v v13, (a4) +; CHECK-NEXT: vl1r.v v13, (a5) ; CHECK-NEXT: vl1r.v v19, (a7) ; CHECK-NEXT: vl1r.v v15, (a6) ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -2390,12 +2373,12 @@ define @vector_interleave_nxv96i1_nxv16i1( ; CHECK-NEXT: vmsne.vi v0, v12, 0 ; CHECK-NEXT: vmsne.vi v10, v18, 0 ; CHECK-NEXT: vmsne.vi v8, v14, 0 -; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v9, v20, a1 ; CHECK-NEXT: vslideup.vx v0, v16, a1 -; CHECK-NEXT: vsetvli zero, a5, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v0, v9, a2 -; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v0, v9, a3 +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; 
CHECK-NEXT: vslideup.vx v8, v10, a1 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 12 @@ -2427,47 +2410,45 @@ define @vector_interleave_nxv96i1_nxv16i1( ; ZVBB-NEXT: vmv1r.v v17, v9 ; ZVBB-NEXT: vmv1r.v v0, v10 ; ZVBB-NEXT: vmerge.vim v24, v20, 1, v0 -; ZVBB-NEXT: addi a5, sp, 16 +; ZVBB-NEXT: addi a4, sp, 16 ; ZVBB-NEXT: vmv1r.v v18, v25 ; ZVBB-NEXT: vmv1r.v v0, v11 ; ZVBB-NEXT: vmerge.vim v26, v20, 1, v0 -; ZVBB-NEXT: csrr a2, vlenb +; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: vmv1r.v v19, v27 ; ZVBB-NEXT: vmv1r.v v0, v12 ; ZVBB-NEXT: vmerge.vim v10, v20, 1, v0 -; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: add a2, a0, a1 ; ZVBB-NEXT: vmv1r.v v20, v11 -; ZVBB-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsetvli a3, zero, e8, m1, ta, ma ; ZVBB-NEXT: vsseg6e8.v v15, (a0) ; ZVBB-NEXT: vmv1r.v v15, v22 -; ZVBB-NEXT: add a4, a5, a2 +; ZVBB-NEXT: add a5, a4, a1 ; ZVBB-NEXT: vmv1r.v v16, v8 -; ZVBB-NEXT: srli a1, a2, 2 +; ZVBB-NEXT: srli a3, a1, 1 ; ZVBB-NEXT: vmv1r.v v17, v24 -; ZVBB-NEXT: add a6, a4, a2 +; ZVBB-NEXT: add a6, a5, a1 ; ZVBB-NEXT: vmv1r.v v18, v26 -; ZVBB-NEXT: add a7, a3, a2 +; ZVBB-NEXT: add a7, a2, a1 ; ZVBB-NEXT: vmv1r.v v19, v10 -; ZVBB-NEXT: vsseg6e8.v v14, (a5) +; ZVBB-NEXT: vsseg6e8.v v14, (a4) ; ZVBB-NEXT: vl1r.v v8, (a0) -; ZVBB-NEXT: add a0, a6, a2 +; ZVBB-NEXT: add a0, a6, a1 ; ZVBB-NEXT: vl1r.v v10, (a6) -; ZVBB-NEXT: add a6, a7, a2 -; ZVBB-NEXT: vl1r.v v12, (a5) -; ZVBB-NEXT: add a5, a0, a2 +; ZVBB-NEXT: add a6, a7, a1 +; ZVBB-NEXT: vl1r.v v12, (a4) +; ZVBB-NEXT: add a4, a0, a1 ; ZVBB-NEXT: vl1r.v v14, (a7) -; ZVBB-NEXT: add a7, a6, a2 -; ZVBB-NEXT: vl1r.v v16, (a5) -; ZVBB-NEXT: add a5, a5, a2 +; ZVBB-NEXT: add a7, a6, a1 +; ZVBB-NEXT: vl1r.v v16, (a4) +; ZVBB-NEXT: add a4, a4, a1 ; ZVBB-NEXT: vl1r.v v18, (a7) -; ZVBB-NEXT: add a7, a7, a2 -; ZVBB-NEXT: srli a2, a2, 1 -; ZVBB-NEXT: vl1r.v v9, (a3) -; ZVBB-NEXT: add a3, a1, a1 -; ZVBB-NEXT: vl1r.v v17, (a5) -; ZVBB-NEXT: add a5, a2, a2 +; ZVBB-NEXT: add a7, a7, a1 +; ZVBB-NEXT: srli a1, 
a1, 2 +; ZVBB-NEXT: vl1r.v v9, (a2) +; ZVBB-NEXT: vl1r.v v17, (a4) ; ZVBB-NEXT: vl1r.v v11, (a0) -; ZVBB-NEXT: vl1r.v v13, (a4) +; ZVBB-NEXT: vl1r.v v13, (a5) ; ZVBB-NEXT: vl1r.v v19, (a7) ; ZVBB-NEXT: vl1r.v v15, (a6) ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma @@ -2477,12 +2458,12 @@ define @vector_interleave_nxv96i1_nxv16i1( ; ZVBB-NEXT: vmsne.vi v0, v12, 0 ; ZVBB-NEXT: vmsne.vi v10, v18, 0 ; ZVBB-NEXT: vmsne.vi v8, v14, 0 -; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma +; ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; ZVBB-NEXT: vslideup.vx v9, v20, a1 ; ZVBB-NEXT: vslideup.vx v0, v16, a1 -; ZVBB-NEXT: vsetvli zero, a5, e8, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v9, a2 -; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma +; ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v9, a3 +; ZVBB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; ZVBB-NEXT: vslideup.vx v8, v10, a1 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: li a1, 12 @@ -3676,23 +3657,21 @@ define @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv112i1_nxv16i1( @vector_interleave_nxv128i1_nxv16i1( @vector_interleave_nxv128i1_nxv16i1( @vector_interleave_nxv4bf16_nxv2bf16( @vector_interleave_nxv4bf16_nxv2bf16( @vector_interleave_nxv4bf16_nxv2bf16( @vector_interleave_nxv4f16_nxv2f16( @vector_interleave_nxv4f16_nxv2f16( @vector_interleave_nxv4f16_nxv2f16( @vector_interleave_nxv6f16_nxv2f16( @vector_interleave_nxv6f16_nxv2f16( @vector_interleave_nxv6bf16_nxv2bf16( @vector_interleave_nxv6bf16_nxv2bf16( @vector_interleave_nxv3f32_nxv1f32( @vector_interleave_nxv3f32_nxv1f32( @vector_interleave_nxv8f16_nxv2f16( @vector_interleave_nxv8f16_nxv2f16( @vector_interleave_nxv8bf16_nxv2bf16( @vector_interleave_nxv8bf16_nxv2bf16( @vector_interleave_nxv4f32_nxv1f32( @vector_interleave_nxv4f32_nxv1f32( @vector_interleave_nxv10f16_nxv2f16( 
@vector_interleave_nxv10f16_nxv2f16( @vector_interleave_nxv10bf16_nxv2bf16( @vector_interleave_nxv10bf16_nxv2bf16( @vector_interleave_nxv5f32_nxv1f32( @vector_interleave_nxv5f32_nxv1f32( @vector_interleave_nxv12f16_nxv2f16( @vector_interleave_nxv12f16_nxv2f16( @vector_interleave_nxv12bf16_nxv2bf16( @vector_interleave_nxv12bf16_nxv2bf16( @vector_interleave_nxv6f32_nxv1f32( @vector_interleave_nxv6f32_nxv1f32( @vector_interleave_nxv14f16_nxv2f16( @vector_interleave_nxv14f16_nxv2f16( @vector_interleave_nxv14f16_nxv2f16( @vector_interleave_nxv14f16_nxv2f16( @vector_interleave_nxv14bf16_nxv2bf16( @vector_interleave_nxv14bf16_nxv2bf16( @vector_interleave_nxv14bf16_nxv2bf16( @vector_interleave_nxv14bf16_nxv2bf16( @vector_interleave_nxv7f32_nxv1f32( @vector_interleave_nxv7f32_nxv1f32( @vector_interleave_nxv7f32_nxv1f32( @vector_interleave_nxv7f32_nxv1f32( @vector_interleave_nxv16f16_nxv2f16( @vector_interleave_nxv16f16_nxv2f16( @vector_interleave_nxv16bf16_nxv2bf16( @vector_interleave_nxv16bf16_nxv2bf16( @vector_interleave_nxv8f32_nxv1f32( @vector_interleave_nxv8f32_nxv1f32( @vfptosi_nxv32bf16_nxv32i1( %va) ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v16 ; CHECK-NEXT: vfncvt.rtz.x.f.w v12, v24 -; CHECK-NEXT: add a1, a0, a0 ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vand.vi v12, v12, 1 ; CHECK-NEXT: vmsne.vi v16, v8, 0 ; CHECK-NEXT: vmsne.vi v0, v12, 0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v0, v16, a0 ; CHECK-NEXT: ret %evec = fptosi %va to @@ -656,12 +655,11 @@ define @vfptoui_nxv32bf16_nxv32i1( %va) ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v16 ; CHECK-NEXT: vfncvt.rtz.xu.f.w v12, v24 -; CHECK-NEXT: add a1, a0, a0 ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vand.vi v12, v12, 1 ; CHECK-NEXT: vmsne.vi v16, v8, 0 ; CHECK-NEXT: vmsne.vi v0, v12, 0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; 
CHECK-NEXT: vslideup.vx v0, v16, a0 ; CHECK-NEXT: ret %evec = fptoui %va to @@ -1654,12 +1652,11 @@ define @vfptosi_nxv32f16_nxv32i1( %va) { ; ZVFHMIN-NEXT: srli a0, a0, 2 ; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v8, v16 ; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v12, v24 -; ZVFHMIN-NEXT: add a1, a0, a0 ; ZVFHMIN-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN-NEXT: vand.vi v12, v12, 1 ; ZVFHMIN-NEXT: vmsne.vi v16, v8, 0 ; ZVFHMIN-NEXT: vmsne.vi v0, v12, 0 -; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslideup.vx v0, v16, a0 ; ZVFHMIN-NEXT: ret %evec = fptosi %va to @@ -1684,12 +1681,11 @@ define @vfptoui_nxv32f16_nxv32i1( %va) { ; ZVFHMIN-NEXT: srli a0, a0, 2 ; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v8, v16 ; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v12, v24 -; ZVFHMIN-NEXT: add a1, a0, a0 ; ZVFHMIN-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN-NEXT: vand.vi v12, v12, 1 ; ZVFHMIN-NEXT: vmsne.vi v16, v8, 0 ; ZVFHMIN-NEXT: vmsne.vi v0, v12, 0 -; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslideup.vx v0, v16, a0 ; ZVFHMIN-NEXT: ret %evec = fptoui %va to diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 142ee5256f9e7..1868154052272 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -567,38 +567,37 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma ; RV32-NEXT: vmv1r.v v8, v0 -; RV32-NEXT: slli a2, a1, 1 ; RV32-NEXT: vmv.v.i v9, 0 -; RV32-NEXT: li a1, -1 +; RV32-NEXT: li a2, -1 ; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV32-NEXT: vmerge.vim v11, v9, 1, v0 -; RV32-NEXT: 
srli a3, a3, 2 ; RV32-NEXT: vwaddu.vv v12, v11, v11 -; RV32-NEXT: vwmaccu.vx v12, a1, v11 +; RV32-NEXT: vwmaccu.vx v12, a2, v11 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: srli a2, a2, 2 ; RV32-NEXT: vmsne.vi v0, v12, 0 -; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV32-NEXT: vslidedown.vx v11, v12, a3 +; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV32-NEXT: vslidedown.vx v11, v12, a2 ; RV32-NEXT: vmerge.vim v10, v10, 1, v0 -; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV32-NEXT: vmsne.vi v0, v11, 0 -; RV32-NEXT: add a1, a3, a3 +; RV32-NEXT: slli a3, a1, 1 ; RV32-NEXT: vmerge.vim v9, v9, 1, v0 -; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; RV32-NEXT: vslideup.vx v10, v9, a3 -; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vslideup.vx v10, v9, a2 +; RV32-NEXT: vsetvli zero, a3, e8, mf2, ta, ma ; RV32-NEXT: vmsne.vi v0, v10, 0 ; RV32-NEXT: vle32.v v10, (a0), v0.t ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma ; RV32-NEXT: vnsrl.wx v13, v10, a1 ; RV32-NEXT: vmv.x.s a1, v10 ; RV32-NEXT: vnsrl.wi v12, v10, 0 -; RV32-NEXT: srli a2, a2, 1 +; RV32-NEXT: srli a3, a3, 1 ; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; RV32-NEXT: vsetvli zero, a3, e32, m1, ta, ma ; RV32-NEXT: vsseg2e32.v v12, (a0), v0.t ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: ret @@ -611,26 +610,24 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV64-NEXT: li a2, -1 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.i v10, 0 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a4, a1, 33 -; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV64-NEXT: vmerge.vim v11, v9, 1, v0 -; RV64-NEXT: srli a3, a3, 2 ; RV64-NEXT: vwaddu.vv v12, v11, v11 ; RV64-NEXT: vwmaccu.vx v12, a2, v11 +; RV64-NEXT: csrr a2, vlenb +; 
RV64-NEXT: srli a2, a2, 2 ; RV64-NEXT: vmsne.vi v0, v12, 0 -; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v11, v12, a3 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vslidedown.vx v11, v12, a2 ; RV64-NEXT: vmerge.vim v10, v10, 1, v0 -; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV64-NEXT: vmsne.vi v0, v11, 0 -; RV64-NEXT: add a1, a3, a3 +; RV64-NEXT: slli a3, a1, 33 ; RV64-NEXT: vmerge.vim v9, v9, 1, v0 -; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; RV64-NEXT: vslideup.vx v10, v9, a3 ; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vslideup.vx v10, v9, a2 ; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: srli a1, a4, 32 +; RV64-NEXT: srli a1, a3, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vle32.v v10, (a0), v0.t ; RV64-NEXT: li a1, 32 @@ -638,9 +635,9 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV64-NEXT: vnsrl.wx v13, v10, a1 ; RV64-NEXT: vmv.x.s a1, v10 ; RV64-NEXT: vnsrl.wi v12, v10, 0 -; RV64-NEXT: srli a4, a4, 33 +; RV64-NEXT: srli a3, a3, 33 ; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; RV64-NEXT: vsetvli zero, a3, e32, m1, ta, ma ; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret @@ -807,10 +804,7 @@ define void @not_balanced_store_tree( %v0, ; RV32-NEXT: srli a3, a3, 3 ; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma ; RV32-NEXT: vslidedown.vx v8, v12, a3 -; RV32-NEXT: add a4, a3, a3 -; RV32-NEXT: vsetvli zero, a4, e32, m1, ta, ma ; RV32-NEXT: vslideup.vx v12, v8, a3 -; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma ; RV32-NEXT: vwaddu.vv v16, v12, v9 ; RV32-NEXT: vwmaccu.vx v16, a2, v9 ; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma @@ -831,10 +825,7 @@ define void @not_balanced_store_tree( %v0, ; RV64-NEXT: srli a3, a3, 3 ; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma ; RV64-NEXT: vslidedown.vx v8, v12, a3 -; RV64-NEXT: add a4, a3, a3 -; 
RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma ; RV64-NEXT: vslideup.vx v12, v8, a3 -; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma ; RV64-NEXT: vwaddu.vv v16, v12, v9 ; RV64-NEXT: vwmaccu.vx v16, a2, v9 ; RV64-NEXT: vsetvli a3, zero, e32, m2, ta, ma @@ -858,29 +849,28 @@ define {, } @not_same_mask( ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma ; RV32-NEXT: vmv1r.v v9, v0 ; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: li a2, -1 ; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV32-NEXT: vmerge.vim v11, v8, 1, v0 ; RV32-NEXT: vmv1r.v v0, v9 ; RV32-NEXT: vmerge.vim v9, v8, 1, v0 -; RV32-NEXT: srli a3, a3, 2 ; RV32-NEXT: vwaddu.vv v12, v9, v11 ; RV32-NEXT: vwmaccu.vx v12, a2, v11 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: srli a2, a2, 2 ; RV32-NEXT: vmsne.vi v0, v12, 0 -; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; RV32-NEXT: vslidedown.vx v9, v12, a3 +; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV32-NEXT: vslidedown.vx v9, v12, a2 ; RV32-NEXT: vmerge.vim v10, v10, 1, v0 -; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV32-NEXT: vmsne.vi v0, v9, 0 -; RV32-NEXT: add a2, a3, a3 +; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: vmerge.vim v8, v8, 1, v0 -; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma -; RV32-NEXT: vslideup.vx v10, v8, a3 +; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV32-NEXT: vslideup.vx v10, v8, a2 ; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; RV32-NEXT: vmsne.vi v0, v10, 0 ; RV32-NEXT: vle32.v v10, (a0), v0.t @@ -899,26 +889,24 @@ define {, } @not_same_mask( ; RV64-NEXT: li a2, -1 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.i v10, 0 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a1, a1, 33 -; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; RV64-NEXT: vsetvli a3, zero, e8, 
mf4, ta, ma ; RV64-NEXT: vmerge.vim v11, v8, 1, v0 ; RV64-NEXT: vmv1r.v v0, v9 ; RV64-NEXT: vmerge.vim v9, v8, 1, v0 -; RV64-NEXT: srli a3, a3, 2 ; RV64-NEXT: vwaddu.vv v12, v9, v11 ; RV64-NEXT: vwmaccu.vx v12, a2, v11 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: srli a2, a2, 2 ; RV64-NEXT: vmsne.vi v0, v12, 0 -; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vx v9, v12, a3 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vslidedown.vx v9, v12, a2 ; RV64-NEXT: vmerge.vim v10, v10, 1, v0 -; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV64-NEXT: vmsne.vi v0, v9, 0 -; RV64-NEXT: add a2, a3, a3 +; RV64-NEXT: slli a1, a1, 33 ; RV64-NEXT: vmerge.vim v8, v8, 1, v0 -; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma -; RV64-NEXT: vslideup.vx v10, v8, a3 -; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; RV64-NEXT: vslideup.vx v10, v8, a2 ; RV64-NEXT: vmsne.vi v0, v10, 0 ; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll index 3da04eb7e6abe..78aae96242fd3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -887,9 +887,9 @@ define half @vreduce_ord_fadd_nxv3f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv3f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma @@ -906,8 +906,7 @@ define half @vreduce_ord_fadd_nxv6f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv6f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; 
CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 @@ -925,8 +924,7 @@ define half @vreduce_ord_fadd_nxv10f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv10f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v12, fa0 @@ -944,9 +942,8 @@ define half @vreduce_ord_fadd_nxv12f16( %v, half %s) { ; CHECK-LABEL: vreduce_ord_fadd_nxv12f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v12, fa0 @@ -965,9 +962,9 @@ define half @vreduce_fadd_nxv3f16( %v, half %s) { ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: lui a1, 1048568 ; CHECK-NEXT: vmv.s.x v10, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma @@ -984,8 +981,7 @@ define half @vreduce_fadd_nxv6f16( %v, half %s) { ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: lui a1, 1048568 ; CHECK-NEXT: vmv.s.x v11, a1 @@ -1002,13 +998,12 @@ declare half @llvm.vector.reduce.fmin.nxv10f16() define half @vreduce_fmin_nxv10f16( %v) { ; CHECK-LABEL: vreduce_fmin_nxv10f16: ; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, 
vlenb -; CHECK-NEXT: lui a1, %hi(.LCPI73_0) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI73_0) +; CHECK-NEXT: lui a0, %hi(.LCPI73_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI73_0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v12, (a1) -; CHECK-NEXT: srli a1, a0, 3 -; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfredmin.vs v12, v8, v12 @@ -1024,9 +1019,8 @@ define half @vreduce_fmax_nxv12f16( %v) { ; CHECK-LABEL: vreduce_fmax_nxv12f16: ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 3 -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub a0, a0, a1 ; CHECK-NEXT: li a1, -512 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll index 371ec7c790dda..522c83fd9fa99 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll @@ -470,6 +470,28 @@ define @select_nxv2i64( %a, %v } +define @select_nxv2i64_constant_true( %a, %b, i32 zeroext %evl) { +; CHECK-LABEL: select_nxv2i64_constant_true: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.select.nxv2i64( %a, splat (i64 -1), %b, i32 %evl) + ret %v +} + +define @select_nxv2i64_constant_false( %a, %b, i32 zeroext %evl) { +; CHECK-LABEL: select_nxv2i64_constant_false: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 100 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.select.nxv2i64( %a, %b, splat (i64 100), i32 %evl) + ret %v +} + declare @llvm.vp.select.nxv4i64(, , , i32) define @select_nxv4i64( %a, %b, 
%c, i32 zeroext %evl) { @@ -702,10 +724,10 @@ define @select_nxv16f64( %a, @vsub_vi_mask_nxv8i32( %va, %va, %vs ret %vc } + +; Make sure we are able to split a type that isn't an MVT even if the scalar +; element type isn't legal on RV32. This used to crash. +define @vsub_vv_nxv64i64( %va, %vb) { +; RV32-LABEL: vsub_vv_nxv64i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: .cfi_def_cfa_offset 80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 68(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 64(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 60(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s4, 56(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s5, 52(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s6, 48(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s7, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s8, 40(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s9, 36(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: .cfi_offset s1, -12 +; RV32-NEXT: .cfi_offset s2, -16 +; RV32-NEXT: .cfi_offset s3, -20 +; RV32-NEXT: .cfi_offset s4, -24 +; RV32-NEXT: .cfi_offset s5, -28 +; RV32-NEXT: .cfi_offset s6, -32 +; RV32-NEXT: .cfi_offset s7, -36 +; RV32-NEXT: .cfi_offset s8, -40 +; RV32-NEXT: .cfi_offset s9, -44 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: mv a3, a2 +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: add a3, a3, a2 +; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0f, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 88 * vlenb +; RV32-NEXT: mv s2, a7 +; RV32-NEXT: mv s3, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 
4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: csrr s6, vlenb +; RV32-NEXT: slli s7, s6, 4 +; RV32-NEXT: slli s8, s6, 3 +; RV32-NEXT: add a1, a7, s7 +; RV32-NEXT: vl8re64.v v8, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: add a1, a7, s8 +; RV32-NEXT: vl8re64.v v8, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: mv a0, s6 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: mv s1, a0 +; RV32-NEXT: add a0, s2, a0 +; RV32-NEXT: vl8re64.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 32 +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: li a1, 40 +; RV32-NEXT: mv a0, s6 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: mv s4, a0 +; RV32-NEXT: add a0, s3, a0 +; RV32-NEXT: vl8re64.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 32 +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: li a1, 48 +; RV32-NEXT: mv a0, s6 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: mv s5, a0 +; RV32-NEXT: add a0, s2, a0 +; RV32-NEXT: slli s9, s6, 5 +; RV32-NEXT: vl8re64.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: mv a1, a0 
+; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 32 +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: add a0, s3, s9 +; RV32-NEXT: vl8re64.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 32 +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: add a0, s2, s4 +; RV32-NEXT: vl8re64.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 32 +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mv a0, s6 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vsub.vv v8, v16, v8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: 
add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsub.vv v8, v16, v8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: vl8re64.v v8, (s3) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: add a1, s3, a0 +; RV32-NEXT: add a2, s2, s9 +; RV32-NEXT: add a3, s3, s7 +; RV32-NEXT: add a4, s2, a0 +; RV32-NEXT: add s3, s3, s8 +; RV32-NEXT: vl8re64.v v8, (s2) +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 3 +; RV32-NEXT: mv a6, a5 +; RV32-NEXT: slli a5, a5, 1 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 32 +; RV32-NEXT: vs8r.v v8, (a5) # vscale x 64-byte 
Folded Spill +; RV32-NEXT: vl8re64.v v8, (a2) +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: mv a5, a2 +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 32 +; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vl8re64.v v8, (a4) +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 4 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 32 +; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vl8re64.v v8, (s3) +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 32 +; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vl8re64.v v8, (a3) +; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vl8re64.v v24, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsub.vv v0, v0, v8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsub.vv v8, v16, v8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add 
a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: addi a1, sp, 32 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsub.vv v8, v16, v8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a2, a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsub.vv v16, v24, v16 +; RV32-NEXT: vs8r.v v0, (s0) +; RV32-NEXT: add s1, s0, s1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vs8r.v v24, (s1) +; RV32-NEXT: add s5, s0, s5 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vs8r.v v24, (s5) +; RV32-NEXT: add s4, s0, s4 +; RV32-NEXT: vs8r.v v16, (s4) +; RV32-NEXT: add s9, s0, s9 +; RV32-NEXT: add a0, s0, a0 +; RV32-NEXT: add s7, s0, s7 +; RV32-NEXT: add s0, s0, s8 +; RV32-NEXT: vs8r.v v8, (s9) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 32 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 
+; RV32-NEXT: mv a1, a0 +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 32 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vs8r.v v8, (s7) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 6 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 32 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vs8r.v v8, (s0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 68(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 64(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 60(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s4, 56(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s5, 52(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s6, 48(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s7, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s8, 40(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s9, 36(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: .cfi_restore s1 +; RV32-NEXT: .cfi_restore s2 +; RV32-NEXT: .cfi_restore s3 +; RV32-NEXT: .cfi_restore s4 +; RV32-NEXT: .cfi_restore s5 +; RV32-NEXT: .cfi_restore s6 +; RV32-NEXT: .cfi_restore s7 +; RV32-NEXT: .cfi_restore s8 +; RV32-NEXT: .cfi_restore s9 +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vv_nxv64i64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -128 +; RV64-NEXT: .cfi_def_cfa_offset 128 +; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 104(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 96(sp) # 8-byte 
Folded Spill +; RV64-NEXT: sd s3, 88(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 80(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s5, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s6, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s7, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s8, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s9, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s1, -24 +; RV64-NEXT: .cfi_offset s2, -32 +; RV64-NEXT: .cfi_offset s3, -40 +; RV64-NEXT: .cfi_offset s4, -48 +; RV64-NEXT: .cfi_offset s5, -56 +; RV64-NEXT: .cfi_offset s6, -64 +; RV64-NEXT: .cfi_offset s7, -72 +; RV64-NEXT: .cfi_offset s8, -80 +; RV64-NEXT: .cfi_offset s9, -88 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: mv a3, a2 +; RV64-NEXT: slli a2, a2, 1 +; RV64-NEXT: add a3, a3, a2 +; RV64-NEXT: slli a2, a2, 2 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0f, 0x72, 0x00, 0x11, 0x80, 0x01, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 128 + 88 * vlenb +; RV64-NEXT: mv s2, a7 +; RV64-NEXT: mv s3, a1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: csrr s6, vlenb +; RV64-NEXT: slli s7, s6, 4 +; RV64-NEXT: slli s8, s6, 3 +; RV64-NEXT: add a1, a7, s7 +; RV64-NEXT: vl8re64.v v8, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte 
Folded Spill +; RV64-NEXT: add a1, a7, s8 +; RV64-NEXT: vl8re64.v v8, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: mv s0, a0 +; RV64-NEXT: li a1, 56 +; RV64-NEXT: mv a0, s6 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: mv s1, a0 +; RV64-NEXT: add a0, s2, a0 +; RV64-NEXT: vl8re64.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 32 +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: li a1, 40 +; RV64-NEXT: mv a0, s6 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: mv s4, a0 +; RV64-NEXT: add a0, s3, a0 +; RV64-NEXT: vl8re64.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 32 +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: li a1, 48 +; RV64-NEXT: mv a0, s6 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: mv s5, a0 +; RV64-NEXT: add a0, s2, a0 +; RV64-NEXT: slli s9, s6, 5 +; RV64-NEXT: vl8re64.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 32 +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: add a0, s3, s9 +; RV64-NEXT: vl8re64.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 32 +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: add a0, s2, s4 +; RV64-NEXT: vl8re64.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: slli a0, a0, 1 +; 
RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 32 +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: li a1, 24 +; RV64-NEXT: mv a0, s6 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: 
slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: vl8re64.v v8, (s3) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: add a1, s3, a0 +; RV64-NEXT: add a2, s2, s9 +; RV64-NEXT: add a3, s3, s7 +; RV64-NEXT: add a4, s2, a0 +; RV64-NEXT: add s3, s3, s8 +; RV64-NEXT: vl8re64.v v8, (s2) +; RV64-NEXT: csrr a5, vlenb +; RV64-NEXT: slli a5, a5, 3 +; RV64-NEXT: mv a6, a5 +; RV64-NEXT: slli a5, a5, 1 +; RV64-NEXT: add a5, a5, a6 +; RV64-NEXT: add a5, sp, a5 +; RV64-NEXT: addi a5, a5, 32 +; RV64-NEXT: vs8r.v v8, (a5) # vscale x 64-byte Folded Spill +; RV64-NEXT: vl8re64.v v8, (a2) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: mv a5, a2 +; RV64-NEXT: slli a2, a2, 1 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 32 +; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; RV64-NEXT: vl8re64.v v8, (a4) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 32 +; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; RV64-NEXT: vl8re64.v v8, (s3) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli 
a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 32 +; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; RV64-NEXT: vl8re64.v v8, (a3) +; RV64-NEXT: addi a2, sp, 32 +; RV64-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; RV64-NEXT: vl8re64.v v24, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsub.vv v0, v0, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: addi a1, sp, 32 +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsub.vv v8, v16, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, 
a1, 1 +; RV64-NEXT: add a2, a2, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsub.vv v16, v24, v16 +; RV64-NEXT: vs8r.v v0, (s0) +; RV64-NEXT: add s1, s0, s1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vs8r.v v24, (s1) +; RV64-NEXT: add s5, s0, s5 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vs8r.v v24, (s5) +; RV64-NEXT: add s4, s0, s4 +; RV64-NEXT: vs8r.v v16, (s4) +; RV64-NEXT: add s9, s0, s9 +; RV64-NEXT: add a0, s0, a0 +; RV64-NEXT: add s7, s0, s7 +; RV64-NEXT: add s0, s0, s8 +; RV64-NEXT: vs8r.v v8, (s9) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 32 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 32 +; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vs8r.v v8, (s7) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 6 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 32 +; RV64-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vs8r.v v8, (s0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: 
slli a0, a0, 2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 128 +; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 104(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 96(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 88(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s4, 80(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s5, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s6, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s7, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s8, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s9, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: .cfi_restore s1 +; RV64-NEXT: .cfi_restore s2 +; RV64-NEXT: .cfi_restore s3 +; RV64-NEXT: .cfi_restore s4 +; RV64-NEXT: .cfi_restore s5 +; RV64-NEXT: .cfi_restore s6 +; RV64-NEXT: .cfi_restore s7 +; RV64-NEXT: .cfi_restore s8 +; RV64-NEXT: .cfi_restore s9 +; RV64-NEXT: addi sp, sp, 128 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %vc = sub %va, %vb + ret %vc +} diff --git a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll new file mode 100644 index 0000000000000..fe2bcf00ba7d4 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll @@ -0,0 +1,958 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=RV32ZBB +; RUN: llc -mtriple=riscv32 -mattr=+zbb,experimental-xqcibm -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32ZBBXQCIBM + +declare i8 @llvm.cttz.i8(i8, i1) +declare i16 @llvm.cttz.i16(i16, i1) +declare i32 @llvm.cttz.i32(i32, i1) +declare i64 @llvm.cttz.i64(i64, i1) +declare i8 @llvm.ctlz.i8(i8, i1) 
+declare i16 @llvm.ctlz.i16(i16, i1) +declare i32 @llvm.ctlz.i32(i32, i1) +declare i64 @llvm.ctlz.i64(i64, i1) + +define i8 @test_cttz_i8(i8 %a) nounwind { +; RV32I-LABEL: test_cttz_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: zext.b a1, a0 +; RV32I-NEXT: beqz a1, .LBB0_2 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: andi a1, a1, 85 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: andi a1, a0, 51 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: andi a0, a0, 51 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a0, a0, 15 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB0_2: +; RV32I-NEXT: li a0, 8 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_cttz_i8: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: li a1, 256 +; RV32ZBB-NEXT: orn a0, a1, a0 +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_cttz_i8: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: li a1, 256 +; RV32ZBBXQCIBM-NEXT: orn a0, a1, a0 +; RV32ZBBXQCIBM-NEXT: ctz a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i8 %a, -1 + %tmp = call i8 @llvm.cttz.i8(i8 %1, i1 false) + ret i8 %tmp +} + +define i16 @test_cttz_i16(i16 %a) nounwind { +; RV32I-LABEL: test_cttz_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: beqz a1, .LBB1_2 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: lui a2, 5 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: addi a1, a2, 1365 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 3 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: and a1, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a1, a0, 15 +; 
RV32I-NEXT: slli a0, a0, 20 +; RV32I-NEXT: srli a0, a0, 28 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: li a0, 16 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_cttz_i16: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lui a1, 16 +; RV32ZBB-NEXT: orn a0, a1, a0 +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_cttz_i16: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: lui a1, 16 +; RV32ZBBXQCIBM-NEXT: orn a0, a1, a0 +; RV32ZBBXQCIBM-NEXT: ctz a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i16 %a, -1 + %tmp = call i16 @llvm.cttz.i16(i16 %1, i1 false) + ret i16 %tmp +} + +define i32 @test_cttz_i32(i32 %a) nounwind { +; RV32I-LABEL: test_cttz_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: beqz a0, .LBB2_2 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: lui a1, 30667 +; RV32I-NEXT: addi a1, a1, 1329 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 27 +; RV32I-NEXT: lui a1, %hi(.LCPI2_0) +; RV32I-NEXT: addi a1, a1, %lo(.LCPI2_0) +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB2_2: +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_cttz_i32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_cttz_i32: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: qc.cto a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i32 %a, -1 + %tmp = call i32 @llvm.cttz.i32(i32 %1, i1 false) + ret i32 %tmp +} + +define i64 @test_cttz_i64(i64 %a) nounwind { +; RV32I-LABEL: test_cttz_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; 
RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: not s3, a1 +; RV32I-NEXT: not s2, a0 +; RV32I-NEXT: or a0, s2, s3 +; RV32I-NEXT: beqz a0, .LBB3_3 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: neg a0, s2 +; RV32I-NEXT: and a0, s2, a0 +; RV32I-NEXT: lui a1, 30667 +; RV32I-NEXT: addi s1, a1, 1329 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lui s4, %hi(.LCPI3_0) +; RV32I-NEXT: addi s4, s4, %lo(.LCPI3_0) +; RV32I-NEXT: neg a0, s3 +; RV32I-NEXT: and a0, s3, a0 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: bnez s2, .LBB3_4 +; RV32I-NEXT: # %bb.2: # %cond.false +; RV32I-NEXT: srli a0, a0, 27 +; RV32I-NEXT: add a0, s4, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: j .LBB3_5 +; RV32I-NEXT: .LBB3_3: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: li a0, 64 +; RV32I-NEXT: j .LBB3_6 +; RV32I-NEXT: .LBB3_4: +; RV32I-NEXT: srli s0, s0, 27 +; RV32I-NEXT: add s0, s4, s0 +; RV32I-NEXT: lbu a0, 0(s0) +; RV32I-NEXT: .LBB3_5: # %cond.false +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: .LBB3_6: # %cond.end +; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_cttz_i64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: bnez a0, .LBB3_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: not a0, a1 +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: addi a0, a0, 32 +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: ret +; RV32ZBB-NEXT: .LBB3_2: +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: li a1, 0 +; 
RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_cttz_i64: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: not a2, a0 +; RV32ZBBXQCIBM-NEXT: bnez a2, .LBB3_2 +; RV32ZBBXQCIBM-NEXT: # %bb.1: +; RV32ZBBXQCIBM-NEXT: qc.cto a0, a1 +; RV32ZBBXQCIBM-NEXT: addi a0, a0, 32 +; RV32ZBBXQCIBM-NEXT: li a1, 0 +; RV32ZBBXQCIBM-NEXT: ret +; RV32ZBBXQCIBM-NEXT: .LBB3_2: +; RV32ZBBXQCIBM-NEXT: qc.cto a0, a0 +; RV32ZBBXQCIBM-NEXT: li a1, 0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i64 %a, -1 + %tmp = call i64 @llvm.cttz.i64(i64 %1, i1 false) + ret i64 %tmp +} + +define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind { +; RV32I-LABEL: test_cttz_i8_zero_undef: +; RV32I: # %bb.0: +; RV32I-NEXT: not a1, a0 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: andi a1, a1, 85 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: andi a1, a0, 51 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: andi a0, a0, 51 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a0, a0, 15 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_cttz_i8_zero_undef: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_cttz_i8_zero_undef: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: qc.cto a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i8 %a, -1 + %tmp = call i8 @llvm.cttz.i8(i8 %1, i1 true) + ret i8 %tmp +} + +define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind { +; RV32I-LABEL: test_cttz_i16_zero_undef: +; RV32I: # %bb.0: +; RV32I-NEXT: not a1, a0 +; RV32I-NEXT: lui a2, 5 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: addi a2, a2, 1365 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: lui a2, 3 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: and a1, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli 
a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a1, a0, 15 +; RV32I-NEXT: slli a0, a0, 20 +; RV32I-NEXT: srli a0, a0, 28 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_cttz_i16_zero_undef: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_cttz_i16_zero_undef: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: qc.cto a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i16 %a, -1 + %tmp = call i16 @llvm.cttz.i16(i16 %1, i1 true) + ret i16 %tmp +} + +define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind { +; RV32I-LABEL: test_cttz_i32_zero_undef: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: neg a1, a0 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: lui a1, 30667 +; RV32I-NEXT: addi a1, a1, 1329 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 27 +; RV32I-NEXT: lui a1, %hi(.LCPI6_0) +; RV32I-NEXT: addi a1, a1, %lo(.LCPI6_0) +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_cttz_i32_zero_undef: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_cttz_i32_zero_undef: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: qc.cto a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i32 %a, -1 + %tmp = call i32 @llvm.cttz.i32(i32 %1, i1 true) + ret i32 %tmp +} + +define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { +; RV32I-LABEL: test_cttz_i64_zero_undef: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; 
RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: not s3, a1 +; RV32I-NEXT: not s4, a0 +; RV32I-NEXT: neg a0, s4 +; RV32I-NEXT: and a0, s4, a0 +; RV32I-NEXT: lui a1, 30667 +; RV32I-NEXT: addi s1, a1, 1329 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lui s2, %hi(.LCPI7_0) +; RV32I-NEXT: addi s2, s2, %lo(.LCPI7_0) +; RV32I-NEXT: neg a0, s3 +; RV32I-NEXT: and a0, s3, a0 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: bnez s4, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srli a0, a0, 27 +; RV32I-NEXT: add a0, s2, a0 +; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: j .LBB7_3 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: srli s0, s0, 27 +; RV32I-NEXT: add s0, s2, s0 +; RV32I-NEXT: lbu a0, 0(s0) +; RV32I-NEXT: .LBB7_3: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_cttz_i64_zero_undef: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: bnez a0, .LBB7_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: not a0, a1 +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: addi a0, a0, 32 +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: ret +; RV32ZBB-NEXT: .LBB7_2: +; RV32ZBB-NEXT: ctz a0, a0 +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_cttz_i64_zero_undef: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: not a2, a0 +; RV32ZBBXQCIBM-NEXT: bnez a2, .LBB7_2 +; RV32ZBBXQCIBM-NEXT: # %bb.1: +; RV32ZBBXQCIBM-NEXT: qc.cto a0, a1 +; RV32ZBBXQCIBM-NEXT: addi a0, a0, 32 +; RV32ZBBXQCIBM-NEXT: li a1, 0 +; RV32ZBBXQCIBM-NEXT: ret +; RV32ZBBXQCIBM-NEXT: .LBB7_2: +; RV32ZBBXQCIBM-NEXT: qc.cto a0, a0 +; 
RV32ZBBXQCIBM-NEXT: li a1, 0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i64 %a, -1 + %tmp = call i64 @llvm.cttz.i64(i64 %1, i1 true) + ret i64 %tmp +} + +define i8 @test_ctlz_i8(i8 %a) nounwind { +; RV32I-LABEL: test_ctlz_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: zext.b a1, a0 +; RV32I-NEXT: beqz a1, .LBB8_2 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srli a1, a1, 25 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srli a1, a1, 26 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srli a1, a1, 28 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: andi a1, a1, 85 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: andi a1, a0, 51 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: andi a0, a0, 51 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a0, a0, 15 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB8_2: +; RV32I-NEXT: li a0, 8 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_ctlz_i8: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: slli a0, a0, 24 +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: clz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_ctlz_i8: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: slli a0, a0, 24 +; RV32ZBBXQCIBM-NEXT: qc.clo a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i8 %a, -1 + %tmp = call i8 @llvm.ctlz.i8(i8 %1, i1 false) + ret i8 %tmp +} + +define i16 @test_ctlz_i16(i16 %a) nounwind { +; RV32I-LABEL: test_ctlz_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: beqz a1, .LBB9_2 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: srli a1, a1, 17 +; RV32I-NEXT: lui a2, 5 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: addi a1, a2, 1365 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: srli a2, a2, 18 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: srli a2, a2, 20 +; RV32I-NEXT: or a0, a0, 
a2 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: srli a2, a2, 24 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 3 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: and a1, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a1, a0, 15 +; RV32I-NEXT: slli a0, a0, 20 +; RV32I-NEXT: srli a0, a0, 28 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB9_2: +; RV32I-NEXT: li a0, 16 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_ctlz_i16: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: slli a0, a0, 16 +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: clz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_ctlz_i16: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: slli a0, a0, 16 +; RV32ZBBXQCIBM-NEXT: qc.clo a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i16 %a, -1 + %tmp = call i16 @llvm.ctlz.i16(i16 %1, i1 false) + ret i16 %tmp +} + +define i32 @test_ctlz_i32(i32 %a) nounwind { +; RV32I-LABEL: test_ctlz_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: beqz a0, .LBB10_2 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: addi a1, a2, 1365 +; RV32I-NEXT: srli a2, a0, 2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 16 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: and a1, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 
+; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: addi a1, a2, -241 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB10_2: +; RV32I-NEXT: li a0, 32 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_ctlz_i32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: clz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_ctlz_i32: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: qc.clo a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i32 %a, -1 + %tmp = call i32 @llvm.ctlz.i32(i32 %1, i1 false) + ret i32 %tmp +} + +define i64 @test_ctlz_i64(i64 %a) nounwind { +; RV32I-LABEL: test_ctlz_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: not a3, a1 +; RV32I-NEXT: not a4, a0 +; RV32I-NEXT: or a0, a4, a3 +; RV32I-NEXT: beqz a0, .LBB11_3 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: lui a1, 209715 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a2, a0, 1365 +; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: addi a0, a5, -241 +; RV32I-NEXT: bnez a3, .LBB11_4 +; RV32I-NEXT: # %bb.2: # %cond.false +; RV32I-NEXT: srli a3, a4, 1 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: srli a4, a3, 2 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: srli a4, a3, 4 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: srli a4, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: not a3, a3 +; RV32I-NEXT: srli a4, a3, 1 +; RV32I-NEXT: and a2, a4, a2 +; RV32I-NEXT: sub a3, a3, a2 +; RV32I-NEXT: and a2, a3, a1 +; RV32I-NEXT: srli a3, a3, 2 +; RV32I-NEXT: and a1, a3, a1 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: srli a2, a1, 4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: addi a0, 
a0, 32 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB11_3: +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: li a0, 64 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB11_4: +; RV32I-NEXT: srli a4, a3, 1 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: srli a4, a3, 2 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: srli a4, a3, 4 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: srli a4, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: not a3, a3 +; RV32I-NEXT: srli a4, a3, 1 +; RV32I-NEXT: and a2, a4, a2 +; RV32I-NEXT: sub a3, a3, a2 +; RV32I-NEXT: and a2, a3, a1 +; RV32I-NEXT: srli a3, a3, 2 +; RV32I-NEXT: and a1, a3, a1 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: srli a2, a1, 4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_ctlz_i64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a1, a1 +; RV32ZBB-NEXT: bnez a1, .LBB11_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: clz a0, a0 +; RV32ZBB-NEXT: addi a0, a0, 32 +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: ret +; RV32ZBB-NEXT: .LBB11_2: +; RV32ZBB-NEXT: clz a0, a1 +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_ctlz_i64: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: not a2, a1 +; RV32ZBBXQCIBM-NEXT: bnez a2, .LBB11_2 +; RV32ZBBXQCIBM-NEXT: # %bb.1: +; RV32ZBBXQCIBM-NEXT: qc.clo a0, a0 +; RV32ZBBXQCIBM-NEXT: addi a0, a0, 32 +; RV32ZBBXQCIBM-NEXT: li a1, 0 +; RV32ZBBXQCIBM-NEXT: ret +; RV32ZBBXQCIBM-NEXT: .LBB11_2: +; RV32ZBBXQCIBM-NEXT: qc.clo a0, a1 +; RV32ZBBXQCIBM-NEXT: li a1, 0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i64 %a, -1 + %tmp = call i64 @llvm.ctlz.i64(i64 %1, i1 false) + ret i64 %tmp +} + +define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind { +; RV32I-LABEL: test_ctlz_i8_zero_undef: +; 
RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srli a1, a1, 25 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srli a1, a1, 26 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srli a1, a1, 28 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: andi a1, a1, 85 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: andi a1, a0, 51 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: andi a0, a0, 51 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a0, a0, 15 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_ctlz_i8_zero_undef: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: slli a0, a0, 24 +; RV32ZBB-NEXT: clz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_ctlz_i8_zero_undef: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: not a0, a0 +; RV32ZBBXQCIBM-NEXT: slli a0, a0, 24 +; RV32ZBBXQCIBM-NEXT: clz a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i8 %a, -1 + %tmp = call i8 @llvm.ctlz.i8(i8 %1, i1 true) + ret i8 %tmp +} + +define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { +; RV32I-LABEL: test_ctlz_i16_zero_undef: +; RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: lui a1, 5 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: addi a1, a1, 1365 +; RV32I-NEXT: srli a2, a2, 17 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: srli a2, a2, 18 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: srli a2, a2, 20 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: srli a2, a2, 24 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 3 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: and a1, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: add a0, a1, a0 
+; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: andi a1, a0, 15 +; RV32I-NEXT: slli a0, a0, 20 +; RV32I-NEXT: srli a0, a0, 28 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_ctlz_i16_zero_undef: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: slli a0, a0, 16 +; RV32ZBB-NEXT: clz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_ctlz_i16_zero_undef: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: not a0, a0 +; RV32ZBBXQCIBM-NEXT: slli a0, a0, 16 +; RV32ZBBXQCIBM-NEXT: clz a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i16 %a, -1 + %tmp = call i16 @llvm.ctlz.i16(i16 %1, i1 true) + ret i16 %tmp +} + +define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { +; RV32I-LABEL: test_ctlz_i32_zero_undef: +; RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: lui a1, 349525 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: addi a1, a1, 1365 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 2 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a2, a0, 16 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: and a1, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: addi a1, a2, -241 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_ctlz_i32_zero_undef: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: clz a0, a0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: 
test_ctlz_i32_zero_undef: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: qc.clo a0, a0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i32 %a, -1 + %tmp = call i32 @llvm.ctlz.i32(i32 %1, i1 true) + ret i32 %tmp +} + +define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { +; RV32I-LABEL: test_ctlz_i64_zero_undef: +; RV32I: # %bb.0: +; RV32I-NEXT: not a4, a1 +; RV32I-NEXT: lui a1, 349525 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: lui a5, 61681 +; RV32I-NEXT: addi a3, a1, 1365 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: addi a1, a5, -241 +; RV32I-NEXT: bnez a4, .LBB15_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a4, a0, 1 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: srli a4, a0, 2 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: srli a4, a0, 4 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: srli a4, a0, 16 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a4, a0, 1 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: sub a0, a0, a3 +; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB15_2: +; RV32I-NEXT: srli a0, a4, 1 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: srli a4, a0, 2 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: srli a4, a0, 4 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: srli a4, a0, 8 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: srli a4, a0, 16 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a4, a0, 1 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: sub a0, a0, a3 +; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 
+; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: test_ctlz_i64_zero_undef: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a1, a1 +; RV32ZBB-NEXT: bnez a1, .LBB15_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: not a0, a0 +; RV32ZBB-NEXT: clz a0, a0 +; RV32ZBB-NEXT: addi a0, a0, 32 +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: ret +; RV32ZBB-NEXT: .LBB15_2: +; RV32ZBB-NEXT: clz a0, a1 +; RV32ZBB-NEXT: li a1, 0 +; RV32ZBB-NEXT: ret +; +; RV32ZBBXQCIBM-LABEL: test_ctlz_i64_zero_undef: +; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: not a2, a1 +; RV32ZBBXQCIBM-NEXT: bnez a2, .LBB15_2 +; RV32ZBBXQCIBM-NEXT: # %bb.1: +; RV32ZBBXQCIBM-NEXT: qc.clo a0, a0 +; RV32ZBBXQCIBM-NEXT: addi a0, a0, 32 +; RV32ZBBXQCIBM-NEXT: li a1, 0 +; RV32ZBBXQCIBM-NEXT: ret +; RV32ZBBXQCIBM-NEXT: .LBB15_2: +; RV32ZBBXQCIBM-NEXT: qc.clo a0, a1 +; RV32ZBBXQCIBM-NEXT: li a1, 0 +; RV32ZBBXQCIBM-NEXT: ret + %1 = xor i64 %a, -1 + %tmp = call i64 @llvm.ctlz.i64(i64 %1, i1 true) + ret i64 %tmp +} diff --git a/llvm/test/CodeGen/SPIRV/constant/spec-constant.ll b/llvm/test/CodeGen/SPIRV/constant/spec-constant.ll new file mode 100644 index 0000000000000..299d61d3bffde --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/constant/spec-constant.ll @@ -0,0 +1,73 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %} + +; CHECK-DAG: OpDecorate [[bool_const:%[0-9]+]] SpecId 1 +; CHECK-DAG: OpDecorate [[short_const:%[0-9]+]] SpecId 2 +; CHECK-DAG: OpDecorate [[int_const:%[0-9]+]] SpecId 3 +; CHECK-DAG: OpDecorate [[long_const:%[0-9]+]] SpecId 4 +; CHECK-DAG: 
OpDecorate [[float_const:%[0-9]+]] SpecId 8 +; CHECK-DAG: OpDecorate [[double_const:%[0-9]+]] SpecId 9 +; CHECK-DAG: OpDecorate [[enum_const:%[0-9]+]] SpecId 10 + +; CHECK-DAG: [[bool_const]] = OpSpecConstantTrue {{%[0-9]+}} +; CHECK-DAG: [[short_const]] = OpSpecConstant {{%[0-9]+}} 4 +; CHECK-DAG: [[int_const]] = OpSpecConstant {{%[0-9]+}} 5 +; CHECK-DAG: [[long_const]] = OpSpecConstant {{%[0-9]+}} 8 +; CHECK-DAG: [[float_const]] = OpSpecConstant {{%[0-9]+}} 1112014848 +; CHECK-DAG: [[double_const]] = OpSpecConstant {{%[0-9]+}} 0 1079574528 +; CHECK-DAG: [[enum_const]] = OpSpecConstant {{%[0-9]+}} 30 + +@_ZL10bool_const = internal addrspace(10) global i32 0, align 4 +@_ZL11short_const = internal addrspace(10) global i16 0, align 2 +@_ZL9int_const = internal addrspace(10) global i32 0, align 4 +@_ZL10long_const = internal addrspace(10) global i64 0, align 8 +@_ZL11float_const = internal addrspace(10) global float 0.000000e+00, align 4 +@_ZL12double_const = internal addrspace(10) global double 0.000000e+00, align 8 +@_ZL10enum_const = internal addrspace(10) global i32 0, align 4 + +; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none) +define void @main() local_unnamed_addr #0 { +entry: + ; CHECK: [[b:%[0-9]+]] = OpSelect {{%[0-9]+}} [[bool_const]] + ; CHECK: OpStore {{%[0-9]+}} [[b]] + %0 = tail call spir_func i1 @_Z20__spirv_SpecConstantib(i32 1, i1 true) + %storedv.i.i = zext i1 %0 to i32 + store i32 %storedv.i.i, ptr addrspace(10) @_ZL10bool_const, align 4 + + ; CHECK: OpStore {{%[0-9]+}} [[short_const]] + %2 = tail call spir_func i16 @_Z20__spirv_SpecConstantis(i32 2, i16 4) + store i16 %2, ptr addrspace(10) @_ZL11short_const, align 2 + + ; CHECK: OpStore {{%[0-9]+}} [[int_const]] + %4 = tail call spir_func i32 @_Z20__spirv_SpecConstantii(i32 3, i32 5) + store i32 %4, ptr addrspace(10) @_ZL9int_const, align 4 + + + ; CHECK: OpStore {{%[0-9]+}} [[long_const]] + %6 = tail call 
spir_func i64 @_Z20__spirv_SpecConstantix(i32 4, i64 8) + store i64 %6, ptr addrspace(10) @_ZL10long_const, align 8 + + ; CHECK: OpStore {{%[0-9]+}} [[float_const]] + %14 = tail call reassoc nnan ninf nsz arcp afn spir_func float @_Z20__spirv_SpecConstantif(i32 8, float 5.000000e+01) + store float %14, ptr addrspace(10) @_ZL11float_const, align 4 + + ; CHECK: OpStore {{%[0-9]+}} [[double_const]] + %16 = tail call reassoc nnan ninf nsz arcp afn spir_func double @_Z20__spirv_SpecConstantid(i32 9, double 1.000000e+02) + store double %16, ptr addrspace(10) @_ZL12double_const, align 8 + + ; CHECK: OpStore {{%[0-9]+}} [[enum_const]] + %18 = tail call spir_func i32 @_Z20__spirv_SpecConstantii(i32 10, i32 30) + store i32 %18, ptr addrspace(10) @_ZL10enum_const, align 4 + ret void +} + + +declare i1 @_Z20__spirv_SpecConstantib(i32, i1) +declare i8 @_Z20__spirv_SpecConstantia(i32, i8) +declare i16 @_Z20__spirv_SpecConstantis(i32, i16) +declare i32 @_Z20__spirv_SpecConstantii(i32, i32) +declare i64 @_Z20__spirv_SpecConstantix(i32, i64) +declare float @_Z20__spirv_SpecConstantif(i32, float) +declare double @_Z20__spirv_SpecConstantid(i32, double) + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll index 14e3767f65564..38ea796c0fcb0 100644 --- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll +++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll @@ -424,7 +424,7 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> 
zeroinitializer, <4 x i32> diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index 04d7a9691b645..3aa77c3955c63 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -317,8 +317,8 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] ; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 ; AVXVNNI-NEXT: vmovd %xmm2, %eax ; AVXVNNI-NEXT: addl %edx, %eax @@ -328,9 +328,9 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVX512VNNI: # %bb.0: # %entry ; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512VNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX512VNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 ; AVX512VNNI-NEXT: vmovd %xmm2, %eax @@ -343,8 +343,8 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm0[0],xmm2[1,2,3] ; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 ; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll index dfae853f9961e..456e6e8f263aa 100644 --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -27,7 +27,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_4xi8_zc: ; AVXVNNI: # %bb.0: # %entry ; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVXVNNI-NEXT: vmovd %xmm1, %eax ; AVXVNNI-NEXT: addl %edi, %eax @@ -36,7 +36,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { ; AVX512VNNI-LABEL: mul_4xi8_zc: ; AVX512VNNI: # %bb.0: # %entry ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512VNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512VNNI-NEXT: vmovd %xmm1, %eax @@ -47,7 +47,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { ; AVX512VLVNNI-LABEL: mul_4xi8_zc: ; AVX512VLVNNI: # %bb.0: # %entry ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax @@ -67,7 +67,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { ; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVXVNNI-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVXVNNI-NEXT: vmovd %xmm1, %eax ; AVXVNNI-NEXT: addl %edi, %eax @@ -78,7 +78,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { ; AVX512VNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512VNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512VNNI-NEXT: vmovd %xmm1, %eax @@ -107,7 +107,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_4xi8_cs: ; AVXVNNI: # %bb.0: # %entry ; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 ; AVXVNNI-NEXT: vmovd %xmm1, %eax @@ -117,7 +117,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVX512VNNI-LABEL: mul_4xi8_cs: ; AVX512VNNI: # %bb.0: # %entry ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512VNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 @@ -129,7 +129,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVX512VLVNNI-LABEL: mul_4xi8_cs: ; AVX512VLVNNI: # %bb.0: # %entry ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/fixup-blend.ll b/llvm/test/CodeGen/X86/fixup-blend.ll index 3126e4823bee6..d64dd6d3114a6 100644 --- a/llvm/test/CodeGen/X86/fixup-blend.ll +++ b/llvm/test/CodeGen/X86/fixup-blend.ll @@ -59,21 +59,45 @@ define <2 x double> @test_v2f64_blend_movsd_optsize(<2 x double> %a0, <2 x doubl ret <2 x double> %r } -define <2 x double> @test_v2f64_blend_movsd_load(<2 x double> %a0, ptr %p1, <2 x double> %a2) { +define <2 x double> @test_v2f64_blend_movsd_load(ptr %p0, <2 x double> %a1, <2 x double> %a2) { ; SSE2-LABEL: test_v2f64_blend_movsd_load: ; SSE2: # %bb.0: -; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] ; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test_v2f64_blend_movsd_load: ; SSE4: # %bb.0: -; SSE4-NEXT: blendpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE4-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] ; SSE4-NEXT: addpd %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX-LABEL: test_v2f64_blend_movsd_load: ; AVX: # %bb.0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] +; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %a0 = load <2 x double>, ptr %p0 + %s = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> + %r = fadd <2 x double> %s, %a2 + ret <2 x double> %r +} + +define <2 x double> @test_v2f64_blend_movsd_load_commute(<2 x double> %a0, ptr %p1, <2 x double> %a2) { +; SSE2-LABEL: test_v2f64_blend_movsd_load_commute: +; SSE2: # %bb.0: +; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_v2f64_blend_movsd_load_commute: +; SSE4: # %bb.0: +; SSE4-NEXT: blendpd {{.*#+}} 
xmm0 = mem[0],xmm0[1] +; SSE4-NEXT: addpd %xmm1, %xmm0 +; SSE4-NEXT: retq +; +; AVX-LABEL: test_v2f64_blend_movsd_load_commute: +; AVX: # %bb.0: ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = mem[0],xmm0[1] ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -178,27 +202,57 @@ define <2 x i64> @test_v2i64_blend_movsd_optsize(<2 x i64> %a0, <2 x i64> %a1, < ret <2 x i64> %r } -define <2 x i64> @test_v2i64_blend_movsd_load(<2 x i64> %a0, ptr %p1, <2 x i64> %a2) { +define <2 x i64> @test_v2i64_blend_movsd_load(ptr %p0, <2 x i64> %a1, <2 x i64> %a2) { ; SSE2-LABEL: test_v2i64_blend_movsd_load: ; SSE2: # %bb.0: -; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] ; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test_v2i64_blend_movsd_load: ; SSE4: # %bb.0: -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] ; SSE4-NEXT: paddq %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_v2i64_blend_movsd_load: ; AVX1: # %bb.0: -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v2i64_blend_movsd_load: ; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %a0 = load <2 x i64>, ptr %p0 + %s = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> + %r = add <2 x i64> %s, %a2 + ret <2 x i64> %r +} + +define <2 x i64> @test_v2i64_blend_movsd_load_commute(<2 x i64> %a0, ptr %p1, <2 x i64> %a2) { +; SSE2-LABEL: test_v2i64_blend_movsd_load_commute: +; SSE2: # %bb.0: +; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_v2i64_blend_movsd_load_commute: +; SSE4: # %bb.0: +; SSE4-NEXT: pblendw {{.*#+}} xmm0 = 
mem[0,1,2,3],xmm0[4,5,6,7] +; SSE4-NEXT: paddq %xmm1, %xmm0 +; SSE4-NEXT: retq +; +; AVX1-LABEL: test_v2i64_blend_movsd_load_commute: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v2i64_blend_movsd_load_commute: +; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -338,21 +392,47 @@ define <4 x float> @test_v4f32_blend_movsd_optsize(<4 x float> %a0, <4 x float> ret <4 x float> %r } -define <4 x float> @test_v4f32_blend_movss_load(<4 x float> %a0, ptr %p1, <4 x float> %a2) { +define <4 x float> @test_v4f32_blend_movss_load(ptr %p0, <4 x float> %a1, <4 x float> %a2) { ; SSE2-LABEL: test_v4f32_blend_movss_load: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps (%rdi), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: addps %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_v4f32_blend_movss_load: +; SSE4: # %bb.0: +; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; SSE4-NEXT: addps %xmm1, %xmm0 +; SSE4-NEXT: retq +; +; AVX-LABEL: test_v4f32_blend_movss_load: +; AVX: # %bb.0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %a0 = load <4 x float>, ptr %p0 + %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> + %r = fadd <4 x float> %s, %a2 + ret <4 x float> %r +} + +define <4 x float> @test_v4f32_blend_movss_load_commute(<4 x float> %a0, ptr %p1, <4 x float> %a2) { +; SSE2-LABEL: test_v4f32_blend_movss_load_commute: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps (%rdi), %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; SSE4-LABEL: test_v4f32_blend_movss_load: +; SSE4-LABEL: test_v4f32_blend_movss_load_commute: ; SSE4: # %bb.0: ; SSE4-NEXT: blendps {{.*#+}} xmm0 = 
mem[0],xmm0[1,2,3] ; SSE4-NEXT: addps %xmm1, %xmm0 ; SSE4-NEXT: retq ; -; AVX-LABEL: test_v4f32_blend_movss_load: +; AVX-LABEL: test_v4f32_blend_movss_load_commute: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 @@ -363,21 +443,45 @@ define <4 x float> @test_v4f32_blend_movss_load(<4 x float> %a0, ptr %p1, <4 x f ret <4 x float> %r } -define <4 x float> @test_v4f32_blend_movsd_load(<4 x float> %a0, ptr %p1, <4 x float> %a2) { +define <4 x float> @test_v4f32_blend_movsd_load(ptr %p0, <4 x float> %a1, <4 x float> %a2) { ; SSE2-LABEL: test_v4f32_blend_movsd_load: ; SSE2: # %bb.0: -; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test_v4f32_blend_movsd_load: ; SSE4: # %bb.0: -; SSE4-NEXT: blendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; SSE4-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; SSE4-NEXT: addps %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX-LABEL: test_v4f32_blend_movsd_load: ; AVX: # %bb.0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %a0 = load <4 x float>, ptr %p0 + %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> + %r = fadd <4 x float> %s, %a2 + ret <4 x float> %r +} + +define <4 x float> @test_v4f32_blend_movsd_load_commute(<4 x float> %a0, ptr %p1, <4 x float> %a2) { +; SSE2-LABEL: test_v4f32_blend_movsd_load_commute: +; SSE2: # %bb.0: +; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_v4f32_blend_movsd_load_commute: +; SSE4: # %bb.0: +; SSE4-NEXT: blendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; SSE4-NEXT: addps %xmm1, %xmm0 +; SSE4-NEXT: retq +; +; AVX-LABEL: test_v4f32_blend_movsd_load_commute: +; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; 
AVX-NEXT: retq @@ -580,27 +684,59 @@ define <4 x i32> @test_v4i32_blend_movsd_optsize(<4 x i32> %a0, <4 x i32> %a1, < ret <4 x i32> %r } -define <4 x i32> @test_v4i32_blend_movss_load(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) { +define <4 x i32> @test_v4i32_blend_movss_load(ptr %p0, <4 x i32> %a1, <4 x i32> %a2) { ; SSE2-LABEL: test_v4i32_blend_movss_load: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps (%rdi), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_v4i32_blend_movss_load: +; SSE4: # %bb.0: +; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; SSE4-NEXT: paddd %xmm1, %xmm0 +; SSE4-NEXT: retq +; +; AVX1-LABEL: test_v4i32_blend_movss_load: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i32_blend_movss_load: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %a0 = load <4 x i32>, ptr %p0 + %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> + %r = add <4 x i32> %s, %a2 + ret <4 x i32> %r +} + +define <4 x i32> @test_v4i32_blend_movss_load_commute(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) { +; SSE2-LABEL: test_v4i32_blend_movss_load_commute: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps (%rdi), %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; SSE4-LABEL: test_v4i32_blend_movss_load: +; SSE4-LABEL: test_v4i32_blend_movss_load_commute: ; SSE4: # %bb.0: ; SSE4-NEXT: pblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7] ; SSE4-NEXT: paddd %xmm1, %xmm0 ; SSE4-NEXT: retq ; -; AVX1-LABEL: test_v4i32_blend_movss_load: +; AVX1-LABEL: test_v4i32_blend_movss_load_commute: ; AVX1: # %bb.0: ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, 
%xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_v4i32_blend_movss_load: +; AVX2-LABEL: test_v4i32_blend_movss_load_commute: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -611,27 +747,57 @@ define <4 x i32> @test_v4i32_blend_movss_load(<4 x i32> %a0, ptr %p1, <4 x i32> ret <4 x i32> %r } -define <4 x i32> @test_v4i32_blend_movsd_load(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) { +define <4 x i32> @test_v4i32_blend_movsd_load(ptr %p0, <4 x i32> %a1, <4 x i32> %a2) { ; SSE2-LABEL: test_v4i32_blend_movsd_load: ; SSE2: # %bb.0: -; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: test_v4i32_blend_movsd_load: ; SSE4: # %bb.0: -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] ; SSE4-NEXT: paddd %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_v4i32_blend_movsd_load: ; AVX1: # %bb.0: -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v4i32_blend_movsd_load: ; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq + %a0 = load <4 x i32>, ptr %p0 + %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> + %r = add <4 x i32> %s, %a2 + ret <4 x i32> %r +} + +define <4 x i32> @test_v4i32_blend_movsd_load_commute(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) { +; SSE2-LABEL: test_v4i32_blend_movsd_load_commute: +; SSE2: # %bb.0: +; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_v4i32_blend_movsd_load_commute: +; SSE4: # %bb.0: +; SSE4-NEXT: pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; SSE4-NEXT: 
paddd %xmm1, %xmm0 +; SSE4-NEXT: retq +; +; AVX1-LABEL: test_v4i32_blend_movsd_load_commute: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i32_blend_movsd_load_commute: +; AVX2: # %bb.0: ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll index 0afc4f784bc5e..568150cfa3971 100644 --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -247,13 +247,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8 +; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0] -; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1] -; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -268,13 +268,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; 
AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4 ; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1] -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0] -; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1] -; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -424,7 +424,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -447,7 +447,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, 
%xmm1 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 84ae818d91832..05c855ed90b3f 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -1014,7 +1014,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1023,7 +1023,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1038,7 +1038,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index 4898ae98faea2..983ae594e3ab1 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -112,7 +112,7 @@ define i64 @test_v4i64_v4i16(<4 x i64> %a0) { ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll index 937ac3d2db885..d99b200385585 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll @@ -231,7 +231,7 @@ define i32 @test_v4i32(<4 x i8> %a0) { ; AVX2-LABEL: test_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: retq @@ -239,7 +239,7 @@ define i32 @test_v4i32(<4 x i8> %a0) { ; AVX512-LABEL: test_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll index 6cc0e1e73fcdb..aed4e023e340c 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -1025,19 +1025,28 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al 
killed $eax -; AVX-NEXT: retq +; AVX1-LABEL: test_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir b/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir new file mode 100644 index 0000000000000..f099d4fddcb33 --- /dev/null +++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir @@ -0,0 +1,318 @@ +# RUN: split-file %s %t + +# If we force "best effort" mode, then we won't see any errors, but we won't use +# v2. 
+# BESTEFFORT-NOT: SEH_UnwindVersion +# BESTEFFORT-NOT: SEH_UnwindV2Start + +;--- alloc_no_dealloc.mir +# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \ +# RUN: %t/alloc_no_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \ +# RUN: FileCheck %s --check-prefix=ALLOC-NO-DEALLOC +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/alloc_no_dealloc.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \ +# RUN: FileCheck %s --check-prefix=BESTEFFORT +# ALLOC-NO-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'alloc_no_dealloc': +# ALLOC-NO-DEALLOC-SAME: The prolog made a stack allocation, but the epilog did not deallocate it + +--- | + define dso_local void @alloc_no_dealloc() local_unnamed_addr { + entry: + ret void + } + !llvm.module.flags = !{!0} + !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2} +... +--- +name: alloc_no_dealloc +body: | + bb.0.entry: + $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags + frame-setup SEH_StackAlloc 40 + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + SEH_EndEpilogue + RET64 +... + +;--- missed_push.mir +# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/missed_push.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \ +# RUN: --check-prefix=MISSED-PUSH +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/missed_push.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \ +# RUN: FileCheck %s --check-prefix=BESTEFFORT +# MISSED-PUSH: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'missed_push': +# MISSED-PUSH-SAME: The prolog pushed more registers than the epilog popped + +--- | + define dso_local void @missed_push() local_unnamed_addr { + entry: + ret void + } + !llvm.module.flags = !{!0} + !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2} +... 
+--- +name: missed_push +body: | + bb.0.entry: + frame-setup PUSH64r killed $rsi, implicit-def $rsp, implicit $rsp + frame-setup SEH_PushReg 60 + frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp + frame-setup SEH_PushReg 55 + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp + SEH_EndEpilogue + RET64 +... + +;--- dealloc_no_alloc.mir +# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \ +# RUN: %t/dealloc_no_alloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \ +# RUN: FileCheck %s --check-prefix=DEALLOC-NO-ALLOC +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/dealloc_no_alloc.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \ +# RUN: FileCheck %s --check-prefix=BESTEFFORT +# DEALLOC-NO-ALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'dealloc_no_alloc': +# DEALLOC-NO-ALLOC-SAME: The epilog is deallocating a stack allocation, but the prolog did not allocate one + +--- | + define dso_local void @dealloc_no_alloc() local_unnamed_addr { + entry: + ret void + } + !llvm.module.flags = !{!0} + !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2} +... +--- +name: dealloc_no_alloc +body: | + bb.0.entry: + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags + SEH_EndEpilogue + RET64 +... 
+ +;--- double_dealloc.mir +# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/double_dealloc.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \ +# RUN: --check-prefix=DOUBLE-DEALLOC +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/double_dealloc.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \ +# RUN: FileCheck %s --check-prefix=BESTEFFORT +# DOUBLE-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'double_dealloc': +# DOUBLE-DEALLOC-SAME: The epilog is deallocating the stack allocation more than once + +--- | + define dso_local void @double_dealloc() local_unnamed_addr { + entry: + ret void + } + !llvm.module.flags = !{!0} + !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2} +... +--- +name: double_dealloc +body: | + bb.0.entry: + $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags + frame-setup SEH_StackAlloc 40 + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags + $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags + SEH_EndEpilogue + RET64 +... 
+ +;--- dealloc_after_epilog.mir +# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \ +# RUN: %t/dealloc_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \ +# RUN: FileCheck %s --check-prefix=DEALLOC-AFTER-EPILOG +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - \ +# RUN: %t/dealloc_after_epilog.mir -run-pass=x86-wineh-unwindv2 \ +# RUN: -x86-wineh-unwindv2-force-mode=1 | FileCheck %s \ +# RUN: --check-prefix=BESTEFFORT +# DEALLOC-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'dealloc_after_epilog': +# DEALLOC-AFTER-EPILOG-SAME: Unexpected mov or add instruction after the epilog + +--- | + define dso_local void @dealloc_after_epilog() local_unnamed_addr { + entry: + ret void + } + !llvm.module.flags = !{!0} + !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2} +... +--- +name: dealloc_after_epilog +body: | + bb.0.entry: + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + SEH_EndEpilogue + $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags + RET64 +... + +;--- pop_before_dealloc.mir +# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \ +# RUN: %t/pop_before_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \ +# RUN: FileCheck %s --check-prefix=POP-BEFORE-DEALLOC +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_before_dealloc.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \ +# RUN: FileCheck %s --check-prefix=BESTEFFORT +# POP-BEFORE-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_before_dealloc': +# POP-BEFORE-DEALLOC-SAME: Cannot pop registers before the stack allocation has been deallocated + +--- | + define dso_local void @pop_before_dealloc() local_unnamed_addr { + entry: + ret void + } + !llvm.module.flags = !{!0} + !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2} +... 
+--- +name: pop_before_dealloc +body: | + bb.0.entry: + frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp + frame-setup SEH_PushReg 55 + $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags + frame-setup SEH_StackAlloc 40 + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp + $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags + SEH_EndEpilogue + RET64 +... + +;--- too_many_pops.mir +# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/too_many_pops.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \ +# RUN: --check-prefix=TOO-MANY-POPS +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/too_many_pops.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \ +# RUN: FileCheck %s --check-prefix=BESTEFFORT +# TOO-MANY-POPS: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'too_many_pops': +# TOO-MANY-POPS-SAME: The epilog is popping more registers than the prolog pushed + +--- | + define dso_local void @too_many_pops() local_unnamed_addr { + entry: + ret void + } + !llvm.module.flags = !{!0} + !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2} +... +--- +name: too_many_pops +body: | + bb.0.entry: + frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp + frame-setup SEH_PushReg 55 + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp + $rsi = frame-destroy POP64r implicit-def $rsp, implicit $rsp + SEH_EndEpilogue + RET64 +... 
+ +;--- pop_in_wrong_order.mir +# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \ +# RUN: %t/pop_in_wrong_order.mir -run-pass=x86-wineh-unwindv2 2>&1 | \ +# RUN: FileCheck %s --check-prefix=POP-WRONG-ORDER +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_in_wrong_order.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \ +# RUN: FileCheck %s --check-prefix=BESTEFFORT +# POP-WRONG-ORDER: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_in_wrong_order': +# POP-WRONG-ORDER-SAME: The epilog is popping a registers in a different order than the prolog pushed them + +--- | + define dso_local void @pop_in_wrong_order() local_unnamed_addr { + entry: + ret void + } + !llvm.module.flags = !{!0} + !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2} +... +--- +name: pop_in_wrong_order +body: | + bb.0.entry: + frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp + frame-setup SEH_PushReg 55 + frame-setup PUSH64r killed $rsi, implicit-def $rsp, implicit $rsp + frame-setup SEH_PushReg 60 + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp + $rsi = frame-destroy POP64r implicit-def $rsp, implicit $rsp + SEH_EndEpilogue + RET64 +... 
+ +;--- pop_after_epilog.mir +# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \ +# RUN: %t/pop_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \ +# RUN: FileCheck %s --check-prefix=POP-AFTER-EPILOG +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_after_epilog.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \ +# RUN: FileCheck %s --check-prefix=BESTEFFORT +# POP-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_after_epilog': +# POP-AFTER-EPILOG-SAME: Registers are being popped after the epilog + +--- | + define dso_local void @pop_after_epilog() local_unnamed_addr { + entry: + ret void + } + !llvm.module.flags = !{!0} + !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2} +... +--- +name: pop_after_epilog +body: | + bb.0.entry: + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + SEH_EndEpilogue + $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp + RET64 +... + +;--- instr_after_epilog.mir +# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \ +# RUN: %t/instr_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \ +# RUN: FileCheck %s --check-prefix=INSTR-AFTER-END +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/instr_after_epilog.mir \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \ +# RUN: FileCheck %s --check-prefix=BESTEFFORT +# INSTR-AFTER-END: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'instr_after_epilog': +# INSTR-AFTER-END-SAME: Unexpected instruction in or after the epilog + +--- | + define dso_local void @instr_after_epilog() local_unnamed_addr { + entry: + ret void + } + !llvm.module.flags = !{!0} + !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2} +... +--- +name: instr_after_epilog +body: | + bb.0.entry: + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + SEH_EndEpilogue + $ecx = MOV32rr killed $eax + RET64 +... 
diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir b/llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir new file mode 100644 index 0000000000000..70c87ad87f792 --- /dev/null +++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir @@ -0,0 +1,94 @@ +# Require V2 and restrict the number of unwind codes to 8 +# RUN: not llc -mtriple=x86_64-pc-windows-msvc -o - %s \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-max-unwind-codes=8 \ +# RUN: 2>&1 | FileCheck %s -check-prefix=REQUIREV2 + +# Force best-effort and restrict the number of unwind codes to 8 +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s \ +# RUN: -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-max-unwind-codes=8 \ +# RUN: -x86-wineh-unwindv2-force-mode=1 | \ +# RUN: FileCheck %s -check-prefix=BESTEFFORT + +# Require V2, but allow the default number of unwind codes (255) +# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s \ +# RUN: -run-pass=x86-wineh-unwindv2 | FileCheck %s -check-prefix=ALLOWMORE + +# Usually 255 unwind codes are permitted, but we passed an arg to llc to limit +# it to 8. +# REQUIREV2: error: example.c:2:1: Windows x64 Unwind v2 is required, but the function 'too_many_epilogs' has too many unwind codes. +# REQUIREV2-SAME: Try splitting the function or reducing the number of places where it exits early with a tail call. + +# If we force "best effort" mode, then we won't see any errors, but we won't use +# v2. +# BESTEFFORT-NOT: SEH_UnwindVersion +# BESTEFFORT-NOT: SEH_UnwindV2Start + +# If we allow more epilogs then too_many_epilogs will compile with v2. 
+# ALLOWMORE-LABEL: too_many_epilogs +# ALLOWMORE: SEH_UnwindVersion 2 +# ALLOWMORE: SEH_UnwindV2Start + +--- | + define dso_local void @too_many_epilogs() local_unnamed_addr !dbg !9 { + entry: + ret void, !dbg !10 + } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!2, !3, !4, !5} + + !0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) + !1 = !DIFile(filename: "/app/example.c", directory: "/app") + !2 = !{i32 1, !"winx64-eh-unwindv2", i32 2} + !3 = !{i32 7, !"Dwarf Version", i32 4} + !4 = !{i32 2, !"CodeView", i32 1} + !5 = !{i32 2, !"Debug Info Version", i32 3} + !6 = !DIFile(filename: "example.c", directory: "/app") + !7 = !DISubroutineType(types: !8) + !8 = !{null} + !9 = distinct !DISubprogram(name: "too_many_epilogs", scope: !6, file: !6, line: 1, type: !7, scopeLine: 2, spFlags: DISPFlagDefinition, unit: !0) + !10 = !DILocation(line: 2, column: 1, scope: !9) + !11 = !DILocation(line: 3, column: 1, scope: !9) +... +--- +name: too_many_epilogs +body: | + bb.0.entry: + frame-setup SEH_EndPrologue + SEH_BeginEpilogue + SEH_EndEpilogue + RET64 debug-location !10 + bb.1: + SEH_BeginEpilogue + SEH_EndEpilogue + RET64 debug-location !11 + bb.2: + SEH_BeginEpilogue + SEH_EndEpilogue + RET64 debug-location !11 + bb.3: + SEH_BeginEpilogue + SEH_EndEpilogue + RET64 debug-location !11 + bb.4: + SEH_BeginEpilogue + SEH_EndEpilogue + RET64 debug-location !11 + bb.5: + SEH_BeginEpilogue + SEH_EndEpilogue + RET64 debug-location !11 + bb.6: + SEH_BeginEpilogue + SEH_EndEpilogue + RET64 debug-location !11 + bb.7: + SEH_BeginEpilogue + SEH_EndEpilogue + RET64 debug-location !11 + bb.8: + SEH_BeginEpilogue + SEH_EndEpilogue + RET64 debug-location !11 + +... 
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index ddd7f10168936..cacc43e96b6ea 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -1329,7 +1329,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper @@ -1340,7 +1340,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -1351,7 +1351,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -2428,7 +2428,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper @@ -2439,7 +2439,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -2450,7 +2450,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4996,7 +4996,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] @@ -5063,7 +5063,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5282,7 +5282,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) @@ -5295,7 +5295,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) @@ -5308,7 +5308,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec. 
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -7347,9 +7347,9 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -7362,7 +7362,7 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -7376,7 +7376,7 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -7405,9 +7405,9 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -7419,7 +7419,7 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -7491,7 +7491,7 @@ define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec. 
; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) @@ -7504,7 +7504,7 @@ define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) @@ -7517,7 +7517,7 @@ define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index ed53c3693c9dc..572ed314ab31d 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -4875,7 +4875,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5068,7 +5068,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm1, (%rcx) @@ -6847,7 +6847,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) diff --git a/llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml b/llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml new file mode 100644 index 0000000000000..23b7f087e9570 --- /dev/null +++ b/llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml @@ -0,0 +1,86 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-dwarfdump -i %t | FileCheck %s + +# Test REL relocation handling for AMDGPU + +# CHECK: DW_TAG_compile_unit +# CHECK: DW_AT_producer ("dxc") +# CHECK: DW_AT_name (".\\example.hlsl") +# CHECK: DW_AT_str_offsets_base (0x00000008) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + OSABI: 
ELFOSABI_AMDGPU_PAL + Type: ET_REL + Machine: EM_AMDGPU + Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1201 ] + SectionHeaderStringTable: .strtab +Sections: + - Name: .debug_abbrev + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 01110125251305032572171017110B120673178C0117000000 + - Name: .debug_info + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 23000000050001080000000001000400010800000000000000005C000000080000000C00000000 + - Name: .debug_str_offsets + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 0C000000050000000000000004000000 + - Name: .rel.debug_info + Type: SHT_REL + Flags: [ SHF_INFO_LINK ] + Link: .symtab + AddressAlign: 0x8 + Info: .debug_info + Relocations: + - Offset: 0x8 + Symbol: .debug_abbrev + Type: R_AMDGPU_ABS32 + - Offset: 0x11 + Symbol: .debug_str_offsets + Type: R_AMDGPU_ABS32 + - Name: .rel.debug_str_offsets + Type: SHT_REL + Flags: [ SHF_INFO_LINK ] + Link: .symtab + AddressAlign: 0x8 + Info: .debug_str_offsets + Relocations: + - Offset: 0x8 + Symbol: .debug_str + Type: R_AMDGPU_ABS32 + - Offset: 0xC + Symbol: .debug_str + Type: R_AMDGPU_ABS32 + - Type: SectionHeaderTable + Sections: + - Name: .strtab + - Name: .debug_abbrev + - Name: .debug_info + - Name: .rel.debug_info + - Name: .debug_str_offsets + - Name: .rel.debug_str_offsets + - Name: .debug_str + - Name: .symtab +Symbols: + - Name: .debug_abbrev + Type: STT_SECTION + Section: .debug_abbrev + - Name: .debug_info + Type: STT_SECTION + Section: .debug_info + - Name: .debug_str_offsets + Type: STT_SECTION + Section: .debug_str_offsets + - Name: .debug_str + Type: STT_SECTION + Section: .debug_str +DWARF: + debug_str: + - 'dxc' + - '.\example.hlsl' +... 
diff --git a/llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll b/llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll new file mode 100644 index 0000000000000..40b791fd27e32 --- /dev/null +++ b/llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll @@ -0,0 +1,79 @@ +; Similar to DW_AT_object_pointer.ll but tests that we correctly +; encode the object pointer index even if it's not the first argument +; of the subprogram (which isn't something the major compilers do, +; but is not mandated by DWARF). + +; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=lldb -dwarf-version=5 -filetype=obj < %s | \ +; RUN: llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK + +; CHECK: DW_TAG_class_type +; CHECK: [[DECL:0x[0-9a-f]+]]: DW_TAG_subprogram +; CHECK: DW_AT_name {{.*}} "A" +; CHECK: DW_AT_object_pointer [DW_FORM_implicit_const] (2) +; +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_object_pointer [DW_FORM_ref4] (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]}) +; CHECK: DW_AT_specification [DW_FORM_ref4] (cu + {{.*}} => {[[DECL]]} +; CHECK: DW_TAG_formal_parameter +; CHECK: DW_TAG_formal_parameter +; CHECK-NOT: "this" +; CHECK: [[PARAM]]: DW_TAG_formal_parameter +; CHECK: DW_AT_name +; CHECK-SAME: = "this") +; CHECK: DW_TAG_formal_parameter + +%class.A = type { i8 } + +define linkonce_odr noundef ptr @_ZN1AC1Eii(ptr noundef nonnull returned align 1 dereferenceable(1) %this, i32 noundef %x, i32 noundef %y, i32 noundef %z) !dbg !24 { +entry: + %this.addr = alloca ptr, align 8 + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + %z.addr = alloca i32, align 4 + store ptr %this, ptr %this.addr, align 8 + #dbg_declare(ptr %this.addr, !26, !DIExpression(), !28) + store i32 %x, ptr %x.addr, align 4 + #dbg_declare(ptr %x.addr, !29, !DIExpression(), !30) + store i32 %y, ptr %y.addr, align 4 + #dbg_declare(ptr %y.addr, !31, !DIExpression(), !32) + store i32 %z, ptr %y.addr, align 4 + #dbg_declare(ptr %z.addr, 
!36, !DIExpression(), !37) + %this1 = load ptr, ptr %this.addr, align 8 + %0 = load i32, ptr %x.addr, align 4, !dbg !33 + %1 = load i32, ptr %y.addr, align 4, !dbg !33 + %2 = load i32, ptr %z.addr, align 4, !dbg !33 + ret ptr %this1, !dbg !34 +} + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!12, !13} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 3, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 20.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") +!3 = !DIFile(filename: "object_ptr.cpp", directory: "/tmp") +!4 = !{!0} +!5 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !3, line: 1, size: 8, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: !6, identifier: "_ZTS1A") +!6 = !{!7} +!7 = !DISubprogram(name: "A", scope: !5, file: !3, line: 2, type: !8, scopeLine: 2, flags: DIFlagPublic | DIFlagPrototyped, spFlags: 0) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !11, !11, !10, !35} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{i32 7, !"Dwarf Version", i32 5} +!13 = !{i32 2, !"Debug Info Version", i32 3} +!18 = !{!"clang version 20.0.0git"} +!24 = distinct !DISubprogram(name: "A", linkageName: "_ZN1AC1Eii", scope: !5, file: !3, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, declaration: !7, retainedNodes: !25) +!25 = !{} +!26 = !DILocalVariable(name: "this", arg: 3, scope: !24, type: !27, flags: DIFlagArtificial | DIFlagObjectPointer) +!27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64) +!28 = !DILocation(line: 0, scope: !24) +!29 = 
!DILocalVariable(name: "x", arg: 2, scope: !24, file: !3, line: 2, type: !11) +!30 = !DILocation(line: 2, column: 19, scope: !24) +!31 = !DILocalVariable(name: "y", arg: 1, scope: !24, file: !3, line: 2, type: !11) +!32 = !DILocation(line: 2, column: 26, scope: !24) +!33 = !DILocation(line: 2, column: 29, scope: !24) +!34 = !DILocation(line: 2, column: 30, scope: !24) +!35 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed) +!36 = !DILocalVariable(name: "z", arg: 4, scope: !24, file: !3, line: 2, type: !35) +!37 = !DILocation(line: 2, column: 35, scope: !24) diff --git a/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll b/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll index d9988ac31451e..596727dce0433 100644 --- a/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll +++ b/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll @@ -1,14 +1,30 @@ -; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump -v -debug-info %t | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=gdb -dwarf-version=5 -filetype=obj < %s | \ +; RUN: llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK,CHECK-GDB + +; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=lldb -dwarf-version=4 -filetype=obj < %s | \ +; RUN: llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK,CHECK-LLDB-DWARF4 + +; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=lldb -dwarf-version=5 -filetype=obj < %s | \ +; RUN: llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK,CHECK-LLDB-DWARF5 ; CHECK: DW_TAG_formal_parameter [ ; CHECK-NOT: "" ; CHECK: DW_TAG ; CHECK: DW_TAG_class_type -; CHECK: DW_AT_object_pointer [DW_FORM_ref4] (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]}) +; CHECK: [[DECL:0x[0-9a-f]+]]: DW_TAG_subprogram +; CHECK: DW_AT_name {{.*}} "A" +; CHECK-LLDB-DWARF5: DW_AT_object_pointer [DW_FORM_implicit_const] (0) +; CHECK-GDB-NOT: DW_AT_object_pointer +; CHECK-LLDB-DWARF4-NOT: DW_AT_object_pointer +; 
CHECK: DW_TAG_formal_parameter +; +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_object_pointer [DW_FORM_ref4] (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]}) +; CHECK: DW_AT_specification [DW_FORM_ref4] (cu + {{.*}} => {[[DECL]]} ; CHECK: [[PARAM]]: DW_TAG_formal_parameter ; CHECK-NOT: DW_TAG -; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-f]*}}] = "this") +; CHECK: DW_AT_name +; CHECK-SAME = "this") %class.A = type { i32 } diff --git a/llvm/test/MC/Disassembler/Xtensa/coprocessor.txt b/llvm/test/MC/Disassembler/Xtensa/coprocessor.txt new file mode 100644 index 0000000000000..83904dcde938c --- /dev/null +++ b/llvm/test/MC/Disassembler/Xtensa/coprocessor.txt @@ -0,0 +1,10 @@ +# RUN: llvm-mc -triple=xtensa -mattr=+coprocessor -disassemble %s | FileCheck -check-prefixes=CHECK-COPROCESSOR %s +# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s + +## Verify that binary code is correctly disassembled with +## Xtensa coprocessor option enabled. Also verify that dissasembling without +## Xtensa coprocessor option generates warnings. + +[0x20,0xe0,0x61] +#CHECK-COPROCESSOR: xsr a2, cpenable +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding diff --git a/llvm/test/MC/Disassembler/Xtensa/debug.txt b/llvm/test/MC/Disassembler/Xtensa/debug.txt new file mode 100644 index 0000000000000..1321f09a973c3 --- /dev/null +++ b/llvm/test/MC/Disassembler/Xtensa/debug.txt @@ -0,0 +1,62 @@ +# RUN: llvm-mc -triple=xtensa -mattr=+debug,+density -disassemble %s | FileCheck -check-prefixes=CHECK-DEBUG %s +# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s + +## Verify that binary code is correctly disassembled with +## Xtensa debug option enabled. Also verify that dissasembling without +## Xtensa debug option generates warnings. 
+ +[0x10,0x41,0x00] +# CHECK-DEBUG: break 1, 1 +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x2c,0xf1] +# CHECK-DEBUG: break.n 1 +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0xe0,0x73,0x00] +# CHECK-DEBUG: lddr32.p a3 +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0xf0,0x73,0x00] +# CHECK-DEBUG: sddr32.p a3 +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xec, 0x61] +#CHECK-DEBUG: xsr a2, icount +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xed, 0x61] +#CHECK-DEBUG: xsr a2, icountlevel +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0x60, 0x61] +#CHECK-DEBUG: xsr a2, ibreakenable +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0x68, 0x61] +#CHECK-DEBUG: xsr a2, ddr +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0x80, 0x61] +#CHECK-DEBUG: xsr a2, ibreaka0 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0x81, 0x61] +#CHECK-DEBUG: xsr a2, ibreaka1 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0x90, 0x61] +#CHECK-DEBUG: xsr a2, dbreaka0 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0x91, 0x61] +#CHECK-DEBUG: xsr a2, dbreaka1 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xa0, 0x61] +#CHECK-DEBUG: xsr a2, dbreakc0 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xa1, 0x61] +#CHECK-DEBUG: xsr a2, dbreakc1 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding diff --git a/llvm/test/MC/Disassembler/Xtensa/exception.txt b/llvm/test/MC/Disassembler/Xtensa/exception.txt new file mode 100644 index 0000000000000..f40cc9e6549ba --- /dev/null +++ b/llvm/test/MC/Disassembler/Xtensa/exception.txt @@ -0,0 +1,42 @@ +# RUN: llvm-mc -triple=xtensa -mattr=+exception -disassemble %s | 
FileCheck -check-prefixes=CHECK-EXCEPTION %s +# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s + +## Verify that binary code is correctly disassembled with +## Xtensa exception option enabled. Also verify that dissasembling without +## Xtensa exception option generates warnings. + +[0x80,0x20,0x00] +# CHECK-EXCEPTION: excw +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x00,0x50,0x00] +# CHECK-EXCEPTION: syscall +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x00,0x30,0x00] +# CHECK-EXCEPTION: rfe +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x00,0x32,0x00] +# CHECK-EXCEPTION: rfde +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xb1, 0x61] +#CHECK-INST: xsr a2, epc1 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xd1, 0x61] +#CHECK-INST: xsr a2, excsave1 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xe8, 0x61] +#CHECK-INST: xsr a2, exccause +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xee, 0x61] +#CHECK-INST: xsr a2, excvaddr +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xc0, 0x61] +#CHECK-INST: xsr a2, depc +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding diff --git a/llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt b/llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt new file mode 100644 index 0000000000000..d5d87918c9d52 --- /dev/null +++ b/llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt @@ -0,0 +1,82 @@ +# RUN: llvm-mc -triple=xtensa -mattr=+highpriinterrupts,+highpriinterrupts-level7 -disassemble %s | FileCheck -check-prefixes=CHECK-HPINTERRUPTS %s +# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s + +## Verify that binary code is correctly 
disassembled with +## Xtensa highpriinterrupts option enabled. Also verify that dissasembling without +## Xtensa highpriinterrupts option generates warnings. + +[0x10,0x31,0x00] +# CHECK-HPINTERRUPTS: rfi 1 +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xb2,0x61] +#CHECK-HPINTERRUPTS: xsr a2, epc2 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xb3,0x61] +#CHECK-HPINTERRUPTS: xsr a2, epc3 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xb4,0x61] +#CHECK-HPINTERRUPTS: xsr a2, epc4 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xb5,0x61] +#CHECK-HPINTERRUPTS: xsr a2, epc5 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xb6,0x61] +#CHECK-HPINTERRUPTS: xsr a2, epc6 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xb7,0x61] +#CHECK-HPINTERRUPTS: xsr a2, epc7 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xc2,0x61] +#CHECK-HPINTERRUPTS: xsr a2, eps2 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xc3,0x61] +#CHECK-HPINTERRUPTS: xsr a2, eps3 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xc4,0x61] +#CHECK-HPINTERRUPTS: xsr a2, eps4 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xc5,0x61] +#CHECK-HPINTERRUPTS: xsr a2, eps5 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xc6,0x61] +#CHECK-HPINTERRUPTS: xsr a2, eps6 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xc7,0x61] +#CHECK-HPINTERRUPTS: xsr a2, eps7 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xd2,0x61] +#CHECK-HPINTERRUPTS: xsr a2, excsave2 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xd3,0x61] +#CHECK-HPINTERRUPTS: xsr a2, excsave3 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction 
encoding + +[0x20,0xd4,0x61] +#CHECK-HPINTERRUPTS: xsr a2, excsave4 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xd5,0x61] +#CHECK-HPINTERRUPTS: xsr a2, excsave5 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xd6,0x61] +#CHECK-HPINTERRUPTS: xsr a2, excsave6 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xd7,0x61] +#CHECK-HPINTERRUPTS: xsr a2, excsave7 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding diff --git a/llvm/test/MC/Disassembler/Xtensa/interrupt.txt b/llvm/test/MC/Disassembler/Xtensa/interrupt.txt new file mode 100644 index 0000000000000..da8ea3aa5dc47 --- /dev/null +++ b/llvm/test/MC/Disassembler/Xtensa/interrupt.txt @@ -0,0 +1,26 @@ +# RUN: llvm-mc -triple=xtensa -mattr=+interrupt -disassemble %s | FileCheck -check-prefixes=CHECK-EXCEPTION %s +# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s + +## Verify that binary code is correctly disassembled with +## Xtensa interrupt option enabled. Also verify that dissasembling without +## Xtensa interrupt option generates warnings. 
+ +[0x20,0x61,0x00] +# CHECK-EXCEPTION: rsil a2, 1 +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x00,0x71,0x00] +# CHECK-EXCEPTION: waiti 1 +# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xe4, 0x61] +#CHECK-INST: xsr a2, intenable +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xe2, 0x03] +#CHECK-INST: rsr a2, interrupt +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20, 0xe3, 0x13] +#CHECK-INST: wsr a2, intclear +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding diff --git a/llvm/test/MC/Disassembler/Xtensa/prid.txt b/llvm/test/MC/Disassembler/Xtensa/prid.txt new file mode 100644 index 0000000000000..104ad1c31185c --- /dev/null +++ b/llvm/test/MC/Disassembler/Xtensa/prid.txt @@ -0,0 +1,10 @@ +# RUN: llvm-mc -triple=xtensa -mattr=+prid -disassemble %s | FileCheck -check-prefixes=CHECK-PRID %s +# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s + +## Verify that binary code is correctly disassembled with +## Xtensa prid option enabled. Also verify that dissasembling without +## Xtensa prid option generates warnings. + +[0x20,0xeb,0x03] +#CHECK-PRID: rsr a2, prid +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding diff --git a/llvm/test/MC/Disassembler/Xtensa/timer.txt b/llvm/test/MC/Disassembler/Xtensa/timer.txt new file mode 100644 index 0000000000000..daacf27872daa --- /dev/null +++ b/llvm/test/MC/Disassembler/Xtensa/timer.txt @@ -0,0 +1,22 @@ +# RUN: llvm-mc -triple=xtensa -mattr=+timers3 -disassemble %s | FileCheck -check-prefixes=CHECK-TIMER %s +# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s + +## Verify that binary code is correctly disassembled with +## Xtensa timer option enabled. 
Also verify that dissasembling without +## Xtensa timer option generates warnings. + +[0x20,0xea,0x61] +#CHECK-INST: xsr a2, ccount +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xf0,0x61] +#CHECK-TIMER: xsr a2, ccompare0 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xf1,0x61] +#CHECK-TIMER: xsr a2, ccompare1 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding + +[0x20,0xf2,0x61] +#CHECK-TIMER: xsr a2, ccompare2 +#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding diff --git a/llvm/test/MC/LoongArch/Misc/cfi-advance.s b/llvm/test/MC/LoongArch/Misc/cfi-advance.s index 662c43e6bceaf..38eba7caf6106 100644 --- a/llvm/test/MC/LoongArch/Misc/cfi-advance.s +++ b/llvm/test/MC/LoongArch/Misc/cfi-advance.s @@ -1,6 +1,8 @@ # RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=-relax %s -o %t.o # RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=RELOC %s # RUN: llvm-dwarfdump --debug-frame %t.o | FileCheck --check-prefix=DWARFDUMP %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=RELAX %s # RELOC: Relocations [ # RELOC-NEXT: .rela.eh_frame { @@ -12,6 +14,16 @@ # DWARFDUMP-NEXT: DW_CFA_advance_loc: 8 # DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 +# RELAX: Relocations [ +# RELAX: .rela.eh_frame { +# RELAX-NEXT: 0x1C R_LARCH_32_PCREL .L{{.*}} 0x0 +# RELAX-NEXT: 0x20 R_LARCH_ADD32 .L{{.*}} 0x0 +# RELAX-NEXT: 0x20 R_LARCH_SUB32 .L{{.*}} 0x0 +# RELAX-NEXT: 0x28 R_LARCH_ADD6 .L{{.*}} 0x0 +# RELAX-NEXT: 0x28 R_LARCH_SUB6 .L{{.*}} 0x0 +# RELAX-NEXT: } +# RELAX-NEXT: ] + .text .globl test .p2align 2 diff --git a/llvm/test/MC/LoongArch/Relocations/fde-reloc.s b/llvm/test/MC/LoongArch/Relocations/fde-reloc.s index 990e07c7f00bd..ab911d1853a87 100644 --- a/llvm/test/MC/LoongArch/Relocations/fde-reloc.s +++ b/llvm/test/MC/LoongArch/Relocations/fde-reloc.s @@ -1,5 +1,7 @@ -# RUN: llvm-mc --filetype=obj 
--triple=loongarch64 < %s \ +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax < %s \ # RUN: | llvm-readobj -r - | FileCheck %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck %s --check-prefix=RELAX ## Ensure that the eh_frame records the symbolic difference with ## the R_LARCH_32_PCREL relocation. @@ -12,3 +14,6 @@ func: # CHECK: Section (4) .rela.eh_frame { # CHECK-NEXT: 0x1C R_LARCH_32_PCREL .text 0x0 # CHECK-NEXT: } +# RELAX: Section ({{.*}}) .rela.eh_frame { +# RELAX-NEXT: 0x1C R_LARCH_32_PCREL .L{{.*}} 0x0 +# RELAX-NEXT: } diff --git a/llvm/test/MC/LoongArch/Relocations/sub-expr.s b/llvm/test/MC/LoongArch/Relocations/sub-expr.s index 0179e1027af8f..8bf046acc6975 100644 --- a/llvm/test/MC/LoongArch/Relocations/sub-expr.s +++ b/llvm/test/MC/LoongArch/Relocations/sub-expr.s @@ -1,28 +1,95 @@ -# RUN: llvm-mc --filetype=obj --triple=loongarch64 %s -o %t -# RUN: llvm-readobj -r %t | FileCheck %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s \ +# RUN: | llvm-readobj -r - | FileCheck %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s \ +# RUN: | llvm-readobj -r - | FileCheck %s --check-prefix=RELAX ## Check that subtraction expressions emit R_LARCH_32_PCREL and R_LARCH_64_PCREL relocations. ## TODO: 1- or 2-byte data relocations are not supported for now. 
# CHECK: Relocations [ -# CHECK-NEXT: Section ({{.*}}) .rela.data { -# CHECK-NEXT: 0x0 R_LARCH_64_PCREL sx 0x0 -# CHECK-NEXT: 0x8 R_LARCH_64_PCREL sy 0x0 -# CHECK-NEXT: 0x10 R_LARCH_32_PCREL sx 0x0 -# CHECK-NEXT: 0x14 R_LARCH_32_PCREL sy 0x0 -# CHECK-NEXT: } +# CHECK-NEXT: Section ({{.*}}) .rela.sx { +# CHECK-NEXT: 0x4 R_LARCH_PCALA_HI20 z 0x0 +# CHECK-NEXT: 0x8 R_LARCH_PCALA_LO12 z 0x0 +# CHECK-NEXT: 0xC R_LARCH_32_PCREL .sy 0xC +# CHECK-NEXT: } +# CHECK-NEXT: Section ({{.*}}) .rela.data { +# CHECK-NEXT: 0x0 R_LARCH_64_PCREL .sx 0x4 +# CHECK-NEXT: 0x8 R_LARCH_64_PCREL .sy 0x4 +# CHECK-NEXT: 0x10 R_LARCH_32_PCREL .sx 0x4 +# CHECK-NEXT: 0x14 R_LARCH_32_PCREL .sy 0x4 +# CHECK-NEXT: 0x18 R_LARCH_ADD64 .sx 0x4 +# CHECK-NEXT: 0x18 R_LARCH_SUB64 .sy 0x4 +# CHECK-NEXT: 0x20 R_LARCH_ADD64 .sy 0x4 +# CHECK-NEXT: 0x20 R_LARCH_SUB64 .sx 0x4 +# CHECK-NEXT: 0x28 R_LARCH_ADD32 .sx 0x4 +# CHECK-NEXT: 0x28 R_LARCH_SUB32 .sy 0x4 +# CHECK-NEXT: 0x2C R_LARCH_ADD32 .sy 0x4 +# CHECK-NEXT: 0x2C R_LARCH_SUB32 .sx 0x4 +# CHECK-NEXT: 0x30 R_LARCH_ADD64 .data 0x30 +# CHECK-NEXT: 0x30 R_LARCH_SUB64 .sx 0x4 +# CHECK-NEXT: 0x38 R_LARCH_ADD32 .data 0x38 +# CHECK-NEXT: 0x38 R_LARCH_SUB32 .sy 0x4 +# CHECK-NEXT: } +# CHECK-NEXT: Section ({{.*}}) .rela.sy { +# CHECK-NEXT: 0x10 R_LARCH_32_PCREL .sx 0x10 +# CHECK-NEXT: } +# CHECK-NEXT: ] -.section sx,"a" -x: +# RELAX: Relocations [ +# RELAX-NEXT: Section ({{.*}}) .rela.sx { +# RELAX-NEXT: 0x4 R_LARCH_PCALA_HI20 z 0x0 +# RELAX-NEXT: 0x4 R_LARCH_RELAX - 0x0 +# RELAX-NEXT: 0x8 R_LARCH_PCALA_LO12 z 0x0 +# RELAX-NEXT: 0x8 R_LARCH_RELAX - 0x0 +# RELAX-NEXT: 0xC R_LARCH_ADD32 y 0x0 +# RELAX-NEXT: 0xC R_LARCH_SUB32 x 0x0 +# RELAX-NEXT: } +# RELAX-NEXT: Section ({{.*}}) .rela.data { +# RELAX-NEXT: 0x0 R_LARCH_64_PCREL x 0x0 +# RELAX-NEXT: 0x8 R_LARCH_64_PCREL y 0x0 +# RELAX-NEXT: 0x10 R_LARCH_32_PCREL x 0x0 +# RELAX-NEXT: 0x14 R_LARCH_32_PCREL y 0x0 +# RELAX-NEXT: 0x18 R_LARCH_ADD64 x 0x0 +# RELAX-NEXT: 0x18 R_LARCH_SUB64 y 0x0 +# RELAX-NEXT: 0x20 
R_LARCH_ADD64 y 0x0 +# RELAX-NEXT: 0x20 R_LARCH_SUB64 x 0x0 +# RELAX-NEXT: 0x28 R_LARCH_ADD32 x 0x0 +# RELAX-NEXT: 0x28 R_LARCH_SUB32 y 0x0 +# RELAX-NEXT: 0x2C R_LARCH_ADD32 y 0x0 +# RELAX-NEXT: 0x2C R_LARCH_SUB32 x 0x0 +# RELAX-NEXT: 0x30 R_LARCH_ADD64 {{.*}} 0x0 +# RELAX-NEXT: 0x30 R_LARCH_SUB64 x 0x0 +# RELAX-NEXT: 0x38 R_LARCH_ADD32 {{.*}} 0x0 +# RELAX-NEXT: 0x38 R_LARCH_SUB32 y 0x0 +# RELAX-NEXT: } +# RELAX-NEXT: Section ({{.*}}) .rela.sy { +# RELAX-NEXT: 0x4 R_LARCH_ALIGN - 0xC +# RELAX-NEXT: 0x10 R_LARCH_ADD32 x 0x0 +# RELAX-NEXT: 0x10 R_LARCH_SUB32 y 0x0 +# RELAX-NEXT: } +# RELAX-NEXT: ] + +.section .sx,"ax" nop +x: +la.pcrel $a0, z +.4byte y-x .data .8byte x-. .8byte y-. .4byte x-. .4byte y-. +.8byte x-y +.8byte y-x +.4byte x-y +.4byte y-x +.8byte .-x +.4byte .-y -.section sy,"a" -y: +.section .sy,"ax" nop +y: +.p2align 4 +.4byte x-y diff --git a/llvm/test/MC/RISCV/xandesvbfhcvt-valid.s b/llvm/test/MC/RISCV/xandesvbfhcvt-valid.s new file mode 100644 index 0000000000000..355846719e46f --- /dev/null +++ b/llvm/test/MC/RISCV/xandesvbfhcvt-valid.s @@ -0,0 +1,27 @@ +# XAndesVBFHCvt - Andes Vector BFLOAT16 Conversion Extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+xandesvbfhcvt -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xandesvbfhcvt < %s \ +# RUN: | llvm-objdump --mattr=+xandesvbfhcvt -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-OBJ %s +# RUN: not llvm-mc -triple=riscv32 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: llvm-mc %s -triple=riscv64 -mattr=+xandesvbfhcvt -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+xandesvbfhcvt < %s \ +# RUN: | llvm-objdump --mattr=+xandesvbfhcvt -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-OBJ %s +# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s 
--check-prefix=CHECK-ERROR + +# CHECK-OBJ: nds.vfwcvt.s.bf16 v8, v10 +# CHECK-ASM: nds.vfwcvt.s.bf16 v8, v10 +# CHECK-ASM: encoding: [0x5b,0x44,0xa0,0x00] +# CHECK-ERROR: instruction requires the following: 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension){{$}} +nds.vfwcvt.s.bf16 v8, v10 + +# CHECK-OBJ: nds.vfncvt.bf16.s v8, v10 +# CHECK-ASM: nds.vfncvt.bf16.s v8, v10 +# CHECK-ASM: encoding: [0x5b,0xc4,0xa0,0x00] +# CHECK-ERROR: instruction requires the following: 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension){{$}} +nds.vfncvt.bf16.s v8, v10 diff --git a/llvm/test/MC/Xtensa/Core/processor-control.s b/llvm/test/MC/Xtensa/Core/processor-control.s index 5b648356fc68b..4a37d8346893e 100644 --- a/llvm/test/MC/Xtensa/Core/processor-control.s +++ b/llvm/test/MC/Xtensa/Core/processor-control.s @@ -20,6 +20,11 @@ esync # CHECK: encoding: [0x00,0x20,0x00] isync +# Instruction format CALLX +# CHECK-INST: ill +# CHECK: encoding: [0x00,0x00,0x00] +ill + # Instruction format RRR # CHECK-INST: nop # CHECK: encoding: [0xf0,0x20,0x00] diff --git a/llvm/test/MC/Xtensa/coprocessor.s b/llvm/test/MC/Xtensa/coprocessor.s new file mode 100644 index 0000000000000..dca8c55fd72cb --- /dev/null +++ b/llvm/test/MC/Xtensa/coprocessor.s @@ -0,0 +1,20 @@ +# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+coprocessor \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s + +.align 4 +LBL0: + +#Instruction format RRR +#CHECK-INST: xsr a2, cpenable +#CHECK: encoding: [0x20,0xe0,0x61] +xsr a2,cpenable + +#Instruction format RRR +#CHECK-INST: xsr a2, cpenable +#CHECK: encoding: [0x20,0xe0,0x61] +xsr.cpenable a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, cpenable +#CHECK: encoding: [0x20,0xe0,0x61] +xsr a2, 224 diff --git a/llvm/test/MC/Xtensa/debug-invalid.s b/llvm/test/MC/Xtensa/debug-invalid.s new file mode 100644 index 0000000000000..74f0df9fe8148 --- /dev/null +++ b/llvm/test/MC/Xtensa/debug-invalid.s @@ -0,0 +1,9 @@ +# RUN: not llvm-mc -triple xtensa 
--mattr=+debug,+density %s 2>&1 | FileCheck %s + +LBL0: + +# Out of range immediates + +# uimm4 +break 16, 0 +# CHECK: :[[#@LINE-1]]:7: error: expected immediate in range [0, 15] diff --git a/llvm/test/MC/Xtensa/debug.s b/llvm/test/MC/Xtensa/debug.s new file mode 100644 index 0000000000000..36b1f110d120b --- /dev/null +++ b/llvm/test/MC/Xtensa/debug.s @@ -0,0 +1,190 @@ +# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+debug,+density \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s + +.align 4 +LBL0: + +# Instruction format RRR +# CHECK-INST: break 1, 1 +# CHECK: encoding: [0x10,0x41,0x00] +break 1, 1 + +# Instruction format RRRN +# CHECK-INST: break.n 1 +# CHECK: encoding: [0x2c,0xf1] +break.n 1 + +# Instruction format RRR +# CHECK-INST: lddr32.p a3 +# CHECK: encoding: [0xe0,0x73,0x00] +lddr32.p a3 + +# Instruction format RRR +# CHECK-INST: sddr32.p a3 +# CHECK: encoding: [0xf0,0x73,0x00] +sddr32.p a3 + +#Instruction format RRR +#CHECK-INST: xsr a2, icount +#CHECK: encoding: [0x20,0xec,0x61] +xsr a2,icount + +#Instruction format RRR +#CHECK-INST: xsr a2, icount +#CHECK: encoding: [0x20,0xec,0x61] +xsr.icount a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, icount +#CHECK: encoding: [0x20,0xec,0x61] +xsr a2, 236 + +#Instruction format RRR +#CHECK-INST: xsr a2, icountlevel +#CHECK: encoding: [0x20,0xed,0x61] +xsr a2,icountlevel + +#Instruction format RRR +#CHECK-INST: xsr a2, icountlevel +#CHECK: encoding: [0x20,0xed,0x61] +xsr.icountlevel a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, icountlevel +#CHECK: encoding: [0x20,0xed,0x61] +xsr a2, 237 + +#Instruction format RRR +#CHECK-INST: xsr a2, ibreaka0 +#CHECK: encoding: [0x20,0x80,0x61] +xsr a2,ibreaka0 + +#Instruction format RRR +#CHECK-INST: xsr a2, ibreaka0 +#CHECK: encoding: [0x20,0x80,0x61] +xsr.ibreaka0 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, ibreaka0 +#CHECK: encoding: [0x20,0x80,0x61] +xsr a2, 128 + +#Instruction format RRR +#CHECK-INST: xsr a2, ibreaka1 +#CHECK: 
encoding: [0x20,0x81,0x61] +xsr a2,ibreaka1 + +#Instruction format RRR +#CHECK-INST: xsr a2, ibreaka1 +#CHECK: encoding: [0x20,0x81,0x61] +xsr.ibreaka1 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, ibreaka1 +#CHECK: encoding: [0x20,0x81,0x61] +xsr a2, 129 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreaka0 +#CHECK: encoding: [0x20,0x90,0x61] +xsr a2,dbreaka0 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreaka0 +#CHECK: encoding: [0x20,0x90,0x61] +xsr.dbreaka0 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreaka0 +#CHECK: encoding: [0x20,0x90,0x61] +xsr a2, 144 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreaka1 +#CHECK: encoding: [0x20,0x91,0x61] +xsr a2,dbreaka1 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreaka1 +#CHECK: encoding: [0x20,0x91,0x61] +xsr.dbreaka1 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreaka1 +#CHECK: encoding: [0x20,0x91,0x61] +xsr a2, 145 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreakc0 +#CHECK: encoding: [0x20,0xa0,0x61] +xsr a2,dbreakc0 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreakc0 +#CHECK: encoding: [0x20,0xa0,0x61] +xsr.dbreakc0 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreakc0 +#CHECK: encoding: [0x20,0xa0,0x61] +xsr a2, 160 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreakc1 +#CHECK: encoding: [0x20,0xa1,0x61] +xsr a2,dbreakc1 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreakc1 +#CHECK: encoding: [0x20,0xa1,0x61] +xsr.dbreakc1 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, dbreakc1 +#CHECK: encoding: [0x20,0xa1,0x61] +xsr a2, 161 + +#Instruction format RRR +#CHECK-INST: xsr a2, ibreakenable +#CHECK: encoding: [0x20,0x60,0x61] +xsr a2,ibreakenable + +#Instruction format RRR +#CHECK-INST: xsr a2, ibreakenable +#CHECK: encoding: [0x20,0x60,0x61] +xsr.ibreakenable a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, ibreakenable +#CHECK: encoding: [0x20,0x60,0x61] +xsr a2, 96 + +#Instruction format RRR +#CHECK-INST: rsr a2, debugcause +#CHECK: 
encoding: [0x20,0xe9,0x03] +rsr a2,debugcause + +#Instruction format RRR +#CHECK-INST: rsr a2, debugcause +#CHECK: encoding: [0x20,0xe9,0x03] +rsr.debugcause a2 + +#Instruction format RRR +#CHECK-INST: rsr a2, debugcause +#CHECK: encoding: [0x20,0xe9,0x03] +rsr a2, 233 + +#Instruction format RRR +#CHECK-INST: xsr a2, ddr +#CHECK: encoding: [0x20,0x68,0x61] +xsr a2,ddr + +#Instruction format RRR +#CHECK-INST: xsr a2, ddr +#CHECK: encoding: [0x20,0x68,0x61] +xsr.ddr a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, ddr +#CHECK: encoding: [0x20,0x68,0x61] +xsr a2, 104 diff --git a/llvm/test/MC/Xtensa/exception.s b/llvm/test/MC/Xtensa/exception.s new file mode 100644 index 0000000000000..7084ddacf0136 --- /dev/null +++ b/llvm/test/MC/Xtensa/exception.s @@ -0,0 +1,100 @@ +# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+exception \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s + +.align 4 +LBL0: + +# Instruction format RRR +# CHECK-INST: excw +# CHECK: encoding: [0x80,0x20,0x00] +excw + +# Instruction format RRR +# CHECK-INST: syscall +# CHECK: encoding: [0x00,0x50,0x00] +syscall + +# Instruction format RRR +# CHECK-INST: rfe +# CHECK: encoding: [0x00,0x30,0x00] +rfe + +# Instruction format RRR +# CHECK-INST: rfde +# CHECK: encoding: [0x00,0x32,0x00] +rfde + +# Instruction format RRR +# CHECK-INST: xsr a2, epc1 +# CHECK: encoding: [0x20,0xb1,0x61] +xsr a2, epc1 + +# Instruction format RRR +# CHECK-INST: xsr a2, epc1 +# CHECK: encoding: [0x20,0xb1,0x61] +xsr.epc1 a2 + +# Instruction format RRR +# CHECK-INST: xsr a2, epc1 +# CHECK: encoding: [0x20,0xb1,0x61] +xsr a2, 177 + +# Instruction format RRR +# CHECK-INST: xsr a2, excsave1 +# CHECK: encoding: [0x20,0xd1,0x61] +xsr a2, excsave1 + +# Instruction format RRR +# CHECK-INST: xsr a2, excsave1 +# CHECK: encoding: [0x20,0xd1,0x61] +xsr.excsave1 a2 + +# Instruction format RRR +# CHECK-INST: xsr a2, excsave1 +# CHECK: encoding: [0x20,0xd1,0x61] +xsr a2, 209 + +# Instruction format RRR +# CHECK-INST: xsr 
a2, exccause +# CHECK: encoding: [0x20,0xe8,0x61] +xsr a2, exccause + +# Instruction format RRR +# CHECK-INST: xsr a2, exccause +# CHECK: encoding: [0x20,0xe8,0x61] +xsr.exccause a2 + +# Instruction format RRR +# CHECK-INST: xsr a2, exccause +# CHECK: encoding: [0x20,0xe8,0x61] +xsr a2, 232 + +# Instruction format RRR +# CHECK-INST: xsr a2, excvaddr +# CHECK: encoding: [0x20,0xee,0x61] +xsr a2, excvaddr + +# Instruction format RRR +# CHECK-INST: xsr a2, excvaddr +# CHECK: encoding: [0x20,0xee,0x61] +xsr.excvaddr a2 + +# Instruction format RRR +# CHECK-INST: xsr a2, excvaddr +# CHECK: encoding: [0x20,0xee,0x61] +xsr a2, 238 + +# Instruction format RRR +# CHECK-INST: xsr a2, depc +# CHECK: encoding: [0x20,0xc0,0x61] +xsr a2, depc + +# Instruction format RRR +# CHECK-INST: xsr a2, depc +# CHECK: encoding: [0x20,0xc0,0x61] +xsr.depc a2 + +# Instruction format RRR +# CHECK-INST: xsr a2, depc +# CHECK: encoding: [0x20,0xc0,0x61] +xsr a2, 192 diff --git a/llvm/test/MC/Xtensa/highinterrupts.s b/llvm/test/MC/Xtensa/highinterrupts.s new file mode 100644 index 0000000000000..4908176b1b030 --- /dev/null +++ b/llvm/test/MC/Xtensa/highinterrupts.s @@ -0,0 +1,280 @@ +# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+highpriinterrupts,+highpriinterrupts-level7 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s + +.align 4 +LBL0: + +# Instruction format RRR +# CHECK-INST: rfi 1 +# CHECK: encoding: [0x10,0x31,0x00] +rfi 1 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc2 +#CHECK: encoding: [0x20,0xb2,0x61] +xsr a2,epc2 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc2 +#CHECK: encoding: [0x20,0xb2,0x61] +xsr.epc2 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc2 +#CHECK: encoding: [0x20,0xb2,0x61] +xsr a2, 178 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc3 +#CHECK: encoding: [0x20,0xb3,0x61] +xsr a2,epc3 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc3 +#CHECK: encoding: [0x20,0xb3,0x61] +xsr.epc3 a2 + +#Instruction format RRR +#CHECK-INST: 
xsr a2, epc3 +#CHECK: encoding: [0x20,0xb3,0x61] +xsr a2, 179 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc4 +#CHECK: encoding: [0x20,0xb4,0x61] +xsr a2,epc4 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc4 +#CHECK: encoding: [0x20,0xb4,0x61] +xsr.epc4 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc4 +#CHECK: encoding: [0x20,0xb4,0x61] +xsr a2, 180 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc5 +#CHECK: encoding: [0x20,0xb5,0x61] +xsr a2,epc5 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc5 +#CHECK: encoding: [0x20,0xb5,0x61] +xsr.epc5 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc5 +#CHECK: encoding: [0x20,0xb5,0x61] +xsr a2, 181 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc6 +#CHECK: encoding: [0x20,0xb6,0x61] +xsr a2,epc6 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc6 +#CHECK: encoding: [0x20,0xb6,0x61] +xsr.epc6 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc6 +#CHECK: encoding: [0x20,0xb6,0x61] +xsr a2, 182 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc7 +#CHECK: encoding: [0x20,0xb7,0x61] +xsr a2,epc7 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc7 +#CHECK: encoding: [0x20,0xb7,0x61] +xsr.epc7 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, epc7 +#CHECK: encoding: [0x20,0xb7,0x61] +xsr a2, 183 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps2 +#CHECK: encoding: [0x20,0xc2,0x61] +xsr a2,eps2 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps2 +#CHECK: encoding: [0x20,0xc2,0x61] +xsr.eps2 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps2 +#CHECK: encoding: [0x20,0xc2,0x61] +xsr a2, 194 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps3 +#CHECK: encoding: [0x20,0xc3,0x61] +xsr a2,eps3 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps3 +#CHECK: encoding: [0x20,0xc3,0x61] +xsr.eps3 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps3 +#CHECK: encoding: [0x20,0xc3,0x61] +xsr a2, 195 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps4 +#CHECK: encoding: 
[0x20,0xc4,0x61] +xsr a2,eps4 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps4 +#CHECK: encoding: [0x20,0xc4,0x61] +xsr.eps4 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps4 +#CHECK: encoding: [0x20,0xc4,0x61] +xsr a2, 196 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps5 +#CHECK: encoding: [0x20,0xc5,0x61] +xsr a2,eps5 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps5 +#CHECK: encoding: [0x20,0xc5,0x61] +xsr.eps5 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps5 +#CHECK: encoding: [0x20,0xc5,0x61] +xsr a2, 197 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps6 +#CHECK: encoding: [0x20,0xc6,0x61] +xsr a2,eps6 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps6 +#CHECK: encoding: [0x20,0xc6,0x61] +xsr.eps6 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps6 +#CHECK: encoding: [0x20,0xc6,0x61] +xsr a2, 198 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps7 +#CHECK: encoding: [0x20,0xc7,0x61] +xsr a2,eps7 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps7 +#CHECK: encoding: [0x20,0xc7,0x61] +xsr.eps7 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, eps7 +#CHECK: encoding: [0x20,0xc7,0x61] +xsr a2, 199 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave2 +#CHECK: encoding: [0x20,0xd2,0x61] +xsr a2,excsave2 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave2 +#CHECK: encoding: [0x20,0xd2,0x61] +xsr.excsave2 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave2 +#CHECK: encoding: [0x20,0xd2,0x61] +xsr a2, 210 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave3 +#CHECK: encoding: [0x20,0xd3,0x61] +xsr a2,excsave3 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave3 +#CHECK: encoding: [0x20,0xd3,0x61] +xsr.excsave3 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave3 +#CHECK: encoding: [0x20,0xd3,0x61] +xsr a2, 211 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave4 +#CHECK: encoding: [0x20,0xd4,0x61] +xsr a2,excsave4 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave4 +#CHECK: 
encoding: [0x20,0xd4,0x61] +xsr.excsave4 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave4 +#CHECK: encoding: [0x20,0xd4,0x61] +xsr a2, 212 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave5 +#CHECK: encoding: [0x20,0xd5,0x61] +xsr a2,excsave5 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave5 +#CHECK: encoding: [0x20,0xd5,0x61] +xsr.excsave5 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave5 +#CHECK: encoding: [0x20,0xd5,0x61] +xsr a2, 213 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave6 +#CHECK: encoding: [0x20,0xd6,0x61] +xsr a2,excsave6 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave6 +#CHECK: encoding: [0x20,0xd6,0x61] +xsr.excsave6 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave6 +#CHECK: encoding: [0x20,0xd6,0x61] +xsr a2, 214 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave7 +#CHECK: encoding: [0x20,0xd7,0x61] +xsr a2,excsave7 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave7 +#CHECK: encoding: [0x20,0xd7,0x61] +xsr.excsave7 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, excsave7 +#CHECK: encoding: [0x20,0xd7,0x61] +xsr a2, 215 diff --git a/llvm/test/MC/Xtensa/interrupt.s b/llvm/test/MC/Xtensa/interrupt.s new file mode 100644 index 0000000000000..cb1b82dbfe5aa --- /dev/null +++ b/llvm/test/MC/Xtensa/interrupt.s @@ -0,0 +1,60 @@ +# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+interrupt \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s + +.align 4 +LBL0: + +# Instruction format RRR +# CHECK-INST: rsil a2, 1 +# CHECK: encoding: [0x20,0x61,0x00] +rsil a2, 1 + +# Instruction format RRR +# CHECK-INST: waiti 1 +# CHECK: encoding: [0x00,0x71,0x00] +waiti 1 + +#Instruction format RRR +#CHECK-INST: rsr a2, interrupt +#CHECK: encoding: [0x20,0xe2,0x03] +rsr a2, interrupt + +#Instruction format RRR +#CHECK-INST: rsr a2, interrupt +#CHECK: encoding: [0x20,0xe2,0x03] +rsr.interrupt a2 + +#Instruction format RRR +#CHECK-INST: rsr a2, interrupt +#CHECK: encoding: 
[0x20,0xe2,0x03] +rsr a2, 226 + +#Instruction format RRR +#CHECK-INST: wsr a2, intclear +#CHECK: encoding: [0x20,0xe3,0x13] +wsr a2, intclear + +#Instruction format RRR +#CHECK-INST: wsr a2, intclear +#CHECK: encoding: [0x20,0xe3,0x13] +wsr.intclear a2 + +#Instruction format RRR +#CHECK-INST: wsr a2, intclear +#CHECK: encoding: [0x20,0xe3,0x13] +wsr a2, 227 + +#Instruction format RRR +#CHECK-INST: xsr a2, intenable +#CHECK: encoding: [0x20,0xe4,0x61] +xsr a2, intenable + +#Instruction format RRR +#CHECK-INST: xsr a2, intenable +#CHECK: encoding: [0x20,0xe4,0x61] +xsr.intenable a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, intenable +#CHECK: encoding: [0x20,0xe4,0x61] +xsr a2, 228 diff --git a/llvm/test/MC/Xtensa/prid.s b/llvm/test/MC/Xtensa/prid.s new file mode 100644 index 0000000000000..75fcc151e8eff --- /dev/null +++ b/llvm/test/MC/Xtensa/prid.s @@ -0,0 +1,20 @@ +# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+prid \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s + +.align 4 +LBL0: + +#Instruction format RRR +#CHECK-INST: rsr a2, prid +#CHECK: encoding: [0x20,0xeb,0x03] +rsr a2,prid + +#Instruction format RRR +#CHECK-INST: rsr a2, prid +#CHECK: encoding: [0x20,0xeb,0x03] +rsr.prid a2 + +#Instruction format RRR +#CHECK-INST: rsr a2, prid +#CHECK: encoding: [0x20,0xeb,0x03] +rsr a2, 235 diff --git a/llvm/test/MC/Xtensa/timer.s b/llvm/test/MC/Xtensa/timer.s new file mode 100644 index 0000000000000..f1fc9709cdec9 --- /dev/null +++ b/llvm/test/MC/Xtensa/timer.s @@ -0,0 +1,65 @@ +# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+timers3 \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s + +.align 4 +LBL0: + +#Instruction format RRR +#CHECK-INST: xsr a2, ccount +#CHECK: encoding: [0x20,0xea,0x61] +xsr a2,ccount + +#Instruction format RRR +#CHECK-INST: xsr a2, ccount +#CHECK: encoding: [0x20,0xea,0x61] +xsr.ccount a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, ccount +#CHECK: encoding: [0x20,0xea,0x61] +xsr a2, 234 + 
+#Instruction format RRR +#CHECK-INST: xsr a2, ccompare0 +#CHECK: encoding: [0x20,0xf0,0x61] +xsr a2,ccompare0 + +#Instruction format RRR +#CHECK-INST: xsr a2, ccompare0 +#CHECK: encoding: [0x20,0xf0,0x61] +xsr.ccompare0 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, ccompare0 +#CHECK: encoding: [0x20,0xf0,0x61] +xsr a2, 240 + +#Instruction format RRR +#CHECK-INST: xsr a2, ccompare1 +#CHECK: encoding: [0x20,0xf1,0x61] +xsr a2,ccompare1 + +#Instruction format RRR +#CHECK-INST: xsr a2, ccompare1 +#CHECK: encoding: [0x20,0xf1,0x61] +xsr.ccompare1 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, ccompare1 +#CHECK: encoding: [0x20,0xf1,0x61] +xsr a2, 241 + +#Instruction format RRR +#CHECK-INST: xsr a2, ccompare2 +#CHECK: encoding: [0x20,0xf2,0x61] +xsr a2,ccompare2 + +#Instruction format RRR +#CHECK-INST: xsr a2, ccompare2 +#CHECK: encoding: [0x20,0xf2,0x61] +xsr.ccompare2 a2 + +#Instruction format RRR +#CHECK-INST: xsr a2, ccompare2 +#CHECK: encoding: [0x20,0xf2,0x61] +xsr a2, 242 diff --git a/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll new file mode 100644 index 0000000000000..ace68a19bf41f --- /dev/null +++ b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll @@ -0,0 +1,382 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=attributor %s -S | FileCheck %s --check-prefix=AMDGCN + +@G = addrspace(1) global i32 zeroinitializer, align 4 +declare void @clobber(i32) #0 +declare ptr addrspace(1) @get_ptr() #0 +declare noalias ptr addrspace(1) @get_noalias_ptr() #0 +declare noalias ptr addrspace(1) @get_untouched_ptr() #1 + +define void @test_nonkernel(ptr addrspace(1) noalias %ptr) { +; AMDGCN-LABEL: define void @test_nonkernel( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +; 
AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6:[0-9]+]] +; AMDGCN-NEXT: ret void +; + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; may not be !invariant.load, as the caller may modify %ptr + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_plain(ptr addrspace(1) %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_plain( +; AMDGCN-SAME: ptr addrspace(1) nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; may not be !invariant.load, as %ptr may alias a pointer in @clobber + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_noalias_ptr(ptr addrspace(1) noalias %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_noalias_ptr( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0:![0-9]+]] +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %val = load i32, ptr addrspace(1) %ptr, align 4 + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_gep(ptr addrspace(1) %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_gep( +; AMDGCN-SAME: ptr addrspace(1) nofree readonly align 4 captures(none) [[PTR:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4 +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 + 
%val = load i32, ptr addrspace(1) %gep, align 4 + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_noalias_gep(ptr addrspace(1) noalias %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_noalias_gep( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly align 4 captures(none) [[PTR:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4 +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4, !invariant.load [[META0]] +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 + %val = load i32, ptr addrspace(1) %gep, align 4 + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_swap(ptr addrspace(1) noalias %ptr, i32 inreg %swap) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_swap( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]], i32 inreg [[SWAP:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: store i32 [[SWAP]], ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; cannot be !invariant.load due to the write to %ptr + store i32 %swap, ptr addrspace(1) %ptr, align 4 + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_volatile(ptr addrspace(1) noalias %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_volatile( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef align 4 [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { +; AMDGCN-NEXT: [[VAL:%.*]] = load volatile i32, ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %val = load volatile i32, ptr addrspace(1) %ptr, align 4 + ;; volatiles loads cannot be 
!invariant.load + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_unordered(ptr addrspace(1) noalias %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_unordered( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] unordered, align 4, !invariant.load [[META0]] +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %val = load atomic i32, ptr addrspace(1) %ptr unordered, align 4 + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_monotonic(ptr addrspace(1) noalias %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_monotonic( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] monotonic, align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %val = load atomic i32, ptr addrspace(1) %ptr monotonic, align 4 + ;; atomic loads with ordering guarantees may have side effects + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_global() { +; AMDGCN-LABEL: define amdgpu_kernel void @test_global( +; AMDGCN-SAME: ) #[[ATTR2]] { +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) @G, align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %val = load i32, ptr addrspace(1) @G, align 4 + ;; is not an !invariant.load as global variables may change + call void @clobber(i32 %val) + ret void +} + +define internal i32 @test_internal_noalias_load(ptr addrspace(1) %ptr) { +; AMDGCN-LABEL: define internal i32 @test_internal_noalias_load( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) 
[[PTR:%.*]]) #[[ATTR4:[0-9]+]] { +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]] +; AMDGCN-NEXT: ret i32 [[VAL]] +; + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; is an !invariant.load due to its only caller @test_call_internal_noalias + ret i32 %val +} + +define amdgpu_kernel void @test_call_internal_noalias(ptr addrspace(1) noalias %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal_noalias( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[VAL:%.*]] = call i32 @test_internal_noalias_load(ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7:[0-9]+]] +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %val = call i32 @test_internal_noalias_load(ptr addrspace(1) %ptr) + call void @clobber(i32 %val) + ret void +} + +define internal i32 @test_internal_load(ptr addrspace(1) noalias %ptr) { +; AMDGCN-LABEL: define internal i32 @test_internal_load( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4]] { +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: ret i32 [[VAL]] +; + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; may not be an !invariant.load since the pointer in @test_call_internal may alias + ret i32 %val +} + +define amdgpu_kernel void @test_call_internal(ptr addrspace(1) %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal( +; AMDGCN-SAME: ptr addrspace(1) nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[VAL:%.*]] = call i32 @test_internal_load(ptr addrspace(1) nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7]] +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %val = call i32 @test_internal_load(ptr addrspace(1) %ptr) 
+ call void @clobber(i32 %val) + ret void +} + +define internal i32 @test_internal_written(ptr addrspace(1) %ptr) { +; AMDGCN-LABEL: define internal i32 @test_internal_written( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4]] { +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: ret i32 [[VAL]] +; + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; cannot be an !invariant.load because of the write in caller @test_call_internal_written + ret i32 %val +} + +define amdgpu_kernel void @test_call_internal_written(ptr addrspace(1) noalias %ptr, i32 inreg %x) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal_written( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree captures(none) [[PTR:%.*]], i32 inreg [[X:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[VAL:%.*]] = call i32 @test_internal_written(ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7]] +; AMDGCN-NEXT: store i32 [[X]], ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %val = call i32 @test_internal_written(ptr addrspace(1) %ptr) + store i32 %x, ptr addrspace(1) %ptr + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_call_ptr() { +; AMDGCN-LABEL: define amdgpu_kernel void @test_call_ptr( +; AMDGCN-SAME: ) #[[ATTR2]] { +; AMDGCN-NEXT: [[PTR:%.*]] = call align 4 ptr addrspace(1) @get_ptr() #[[ATTR6]] +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %ptr = call ptr addrspace(1) @get_ptr() + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; may not be an !invariant.load since %ptr may alias + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_call_noalias_ptr() { +; AMDGCN-LABEL: define amdgpu_kernel 
void @test_call_noalias_ptr( +; AMDGCN-SAME: ) #[[ATTR2]] { +; AMDGCN-NEXT: [[PTR:%.*]] = call align 4 ptr addrspace(1) @get_noalias_ptr() #[[ATTR6]] +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %ptr = call ptr addrspace(1) @get_noalias_ptr() + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; may not be an !invariant.load since %ptr may have been written to before returning + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_call_untouched_ptr() { +; AMDGCN-LABEL: define amdgpu_kernel void @test_call_untouched_ptr( +; AMDGCN-SAME: ) #[[ATTR2]] { +; AMDGCN-NEXT: [[PTR:%.*]] = call noalias align 4 ptr addrspace(1) @get_untouched_ptr() #[[ATTR8:[0-9]+]] +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]] +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %ptr = call ptr addrspace(1) @get_untouched_ptr() + %val = load i32, ptr addrspace(1) %ptr, align 4 + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_make_buffer(ptr addrspace(1) %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer( +; AMDGCN-SAME: ptr addrspace(1) nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i32 noundef 0, i32 noundef 0) #[[ATTR9:[0-9]+]] +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 0, i32 0) + %val = load i32, ptr addrspace(7) %rsrc, align 4 + ;; original %ptr may alias + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void 
@test_make_buffer_noalias(ptr addrspace(1) noalias %ptr) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer_noalias( +; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i32 noundef 0, i32 noundef 0) #[[ATTR9]] +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4, !invariant.load [[META0]] +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 0, i32 0) + %val = load i32, ptr addrspace(7) %rsrc, align 4 + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_selected_load(i1 inreg %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) noalias %ptr.false) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_selected_load( +; AMDGCN-SAME: i1 inreg [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[PTR:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PTR_TRUE]], ptr addrspace(1) [[PTR_FALSE]] +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]] +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %ptr = select i1 %cond, ptr addrspace(1) %ptr.true, ptr addrspace(1) %ptr.false + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; either pointer yields an !invariant.load + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_selected_load_partial_noalias(i1 inreg %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) %ptr.false) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_selected_load_partial_noalias( +; AMDGCN-SAME: i1 inreg [[COND:%.*]], 
ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[PTR:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PTR_TRUE]], ptr addrspace(1) [[PTR_FALSE]] +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; + %ptr = select i1 %cond, ptr addrspace(1) %ptr.true, ptr addrspace(1) %ptr.false + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; %ptr.false may alias, so no !invariant.load + call void @clobber(i32 %val) + ret void +} + +define amdgpu_kernel void @test_branch_load(i1 %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) noalias %ptr.false) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_branch_load( +; AMDGCN-SAME: i1 noundef [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[ENTRY:.*:]] +; AMDGCN-NEXT: br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]] +; AMDGCN: [[TRUE]]: +; AMDGCN-NEXT: call void @clobber(i32 noundef 1) #[[ATTR6]] +; AMDGCN-NEXT: br label %[[FINISH:.*]] +; AMDGCN: [[FALSE]]: +; AMDGCN-NEXT: br label %[[FINISH]] +; AMDGCN: [[FINISH]]: +; AMDGCN-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ [[PTR_TRUE]], %[[TRUE]] ], [ [[PTR_FALSE]], %[[FALSE]] ] +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]] +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; +entry: + br i1 %cond, label %true, label %false +true: + call void @clobber(i32 1) + br label %finish +false: + br label %finish +finish: + %ptr = phi ptr addrspace(1) [ %ptr.true, %true ], [ %ptr.false, %false ] + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; either pointer yields an !invariant.load + call void @clobber(i32 %val) + 
ret void +} + +define amdgpu_kernel void @test_branch_load_partial_noalias(i1 %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) %ptr.false) { +; AMDGCN-LABEL: define amdgpu_kernel void @test_branch_load_partial_noalias( +; AMDGCN-SAME: i1 noundef [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] { +; AMDGCN-NEXT: [[ENTRY:.*:]] +; AMDGCN-NEXT: br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]] +; AMDGCN: [[TRUE]]: +; AMDGCN-NEXT: call void @clobber(i32 noundef 1) #[[ATTR6]] +; AMDGCN-NEXT: br label %[[FINISH:.*]] +; AMDGCN: [[FALSE]]: +; AMDGCN-NEXT: br label %[[FINISH]] +; AMDGCN: [[FINISH]]: +; AMDGCN-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ [[PTR_TRUE]], %[[TRUE]] ], [ [[PTR_FALSE]], %[[FALSE]] ] +; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR6]] +; AMDGCN-NEXT: ret void +; +entry: + br i1 %cond, label %true, label %false +true: + call void @clobber(i32 1) + br label %finish +false: + br label %finish +finish: + %ptr = phi ptr addrspace(1) [ %ptr.true, %true ], [ %ptr.false, %false ] + %val = load i32, ptr addrspace(1) %ptr, align 4 + ;; ptr.false may alias, so no !invariant.load + call void @clobber(i32 %val) + ret void +} + +attributes #0 = { nofree norecurse nosync nounwind willreturn } +attributes #1 = { nofree norecurse nosync nounwind willreturn readonly } +;. +; AMDGCN: [[META0]] = !{} +;. 
diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll index 07e2d5ea15752..5bff2a2e6b208 100644 --- a/llvm/test/Transforms/Attributor/dereferenceable-1.ll +++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll @@ -207,7 +207,6 @@ define void @f7_1(ptr %ptr, i1 %cnd) { ; CHECK-LABEL: define {{[^@]+}}@f7_1 ; CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[PTR:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[A:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]] -; CHECK-NEXT: [[PTR_0:%.*]] = load i32, ptr [[PTR]], align 4 ; CHECK-NEXT: [[B:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]] ; CHECK-NEXT: br i1 [[CND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] ; CHECK: if.true: diff --git a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll index 374d5ba7ff52b..4767244800d21 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll @@ -135,7 +135,7 @@ define internal %S @foo.1(ptr %foo.this) { ; TUNIT-NEXT: [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8 ; TUNIT-NEXT: store ptr [[FOO_THIS]], ptr [[FOO_THIS]], align 8 ; TUNIT-NEXT: call void @bar.2(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[FOO_THIS]]) #[[ATTR5:[0-9]+]] -; TUNIT-NEXT: [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8 +; TUNIT-NEXT: [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8:![0-9]+]] ; TUNIT-NEXT: ret [[S]] [[FOO_RET]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) @@ -145,7 +145,7 @@ define internal %S @foo.1(ptr %foo.this) { ; CGSCC-NEXT: [[RETVAL:%.*]] = 
alloca [[S:%.*]], i32 0, align 8 ; CGSCC-NEXT: store ptr [[FOO_THIS]], ptr [[FOO_THIS]], align 8 ; CGSCC-NEXT: call void @bar.2(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[FOO_THIS]]) #[[ATTR6]] -; CGSCC-NEXT: [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8 +; CGSCC-NEXT: [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8:![0-9]+]] ; CGSCC-NEXT: ret [[S]] [[FOO_RET]] ; entry: @@ -234,7 +234,7 @@ define internal %S @bar.5(ptr %this) { ; TUNIT-NEXT: [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8 ; TUNIT-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 ; TUNIT-NEXT: call void @baz.6(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) [[RETVAL]], ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4]] -; TUNIT-NEXT: [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8 +; TUNIT-NEXT: [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]] ; TUNIT-NEXT: ret [[S]] [[BAR_RET]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) @@ -244,7 +244,7 @@ define internal %S @bar.5(ptr %this) { ; CGSCC-NEXT: [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8 ; CGSCC-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 ; CGSCC-NEXT: call void @baz.6(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR9:[0-9]+]] -; CGSCC-NEXT: [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8 +; CGSCC-NEXT: [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]] ; CGSCC-NEXT: ret [[S]] [[BAR_RET]] ; entry: @@ -286,7 +286,7 @@ define internal void @boom(ptr %this, ptr %data) { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[DATA_ADDR:%.*]] = alloca ptr, i32 0, align 8 ; TUNIT-NEXT: store ptr [[DATA]], ptr 
[[DATA_ADDR]], align 8 -; TUNIT-NEXT: [[V:%.*]] = load ptr, ptr [[DATA_ADDR]], align 8 +; TUNIT-NEXT: [[V:%.*]] = load ptr, ptr [[DATA_ADDR]], align 8, !invariant.load [[META8]] ; TUNIT-NEXT: store ptr [[V]], ptr [[THIS]], align 8 ; TUNIT-NEXT: ret void ; @@ -342,14 +342,6 @@ define %S.2 @t3.helper() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RETVAL:%.*]] = alloca [[S_2:%.*]], align 8 ; CHECK-NEXT: call void @ext1(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]]) -; CHECK-NEXT: [[DOTFCA_0_LOAD:%.*]] = load ptr, ptr [[RETVAL]], align 8 -; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[S_2]] poison, ptr [[DOTFCA_0_LOAD]], 0 -; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [[S_2]], ptr [[RETVAL]], i32 0, i32 1 -; CHECK-NEXT: [[DOTFCA_1_LOAD:%.*]] = load i64, ptr [[DOTFCA_1_GEP]], align 8 -; CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [[S_2]] [[DOTFCA_0_INSERT]], i64 [[DOTFCA_1_LOAD]], 1 -; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [[S_2]], ptr [[RETVAL]], i32 0, i32 2 -; CHECK-NEXT: [[DOTFCA_2_LOAD:%.*]] = load i64, ptr [[DOTFCA_2_GEP]], align 8 -; CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [[S_2]] [[DOTFCA_1_INSERT]], i64 [[DOTFCA_2_LOAD]], 2 ; CHECK-NEXT: ret [[S_2]] zeroinitializer ; entry: @@ -508,7 +500,7 @@ define internal %S @t4a(ptr %this) { ; CGSCC-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, i32 0, align 8 ; CGSCC-NEXT: store ptr [[THIS]], ptr [[THIS]], align 8 ; CGSCC-NEXT: call void @t4b(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[THIS]]) #[[ATTR6]] -; CGSCC-NEXT: [[TMP0:%.*]] = load [[S]], ptr [[RETVAL]], align 8 +; CGSCC-NEXT: [[TMP0:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]] ; CGSCC-NEXT: ret [[S]] [[TMP0]] ; entry: @@ -623,6 +615,7 @@ entry: ; TUNIT: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; TUNIT: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 
2} ; TUNIT: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +; TUNIT: [[META8]] = !{} ;. ; CGSCC: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]} ; CGSCC: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} @@ -632,4 +625,5 @@ entry: ; CGSCC: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} ; CGSCC: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2} ; CGSCC: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +; CGSCC: [[META8]] = !{} ;. diff --git a/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll b/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll index 030d315bfd925..df2feb087e397 100644 --- a/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll +++ b/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll @@ -286,3 +286,35 @@ define void @test_dse_non_alloca() { ret void } +define void @test_other_read_effects() { +; CHECK-LABEL: @test_other_read_effects( +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + call void @f(ptr %a) memory(read, argmem: readwrite) nounwind willreturn + ret void +} + +define i32 @test_other_read_effects_read_after() { +; CHECK-LABEL: @test_other_read_effects_read_after( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @f(ptr [[A]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: ret i32 [[V]] +; + %a = alloca i32, align 4 + call void @f(ptr %a) memory(read, argmem: readwrite) nounwind willreturn + %v = load i32, ptr %a + ret i32 %v +} + +define void @test_other_write_effects() { +; CHECK-LABEL: @test_other_write_effects( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @f(ptr [[A]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: ret void +; + %a = alloca i32, align 4 + call void @f(ptr %a) memory(write, argmem: readwrite) nounwind willreturn + ret void +} diff --git a/llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll 
b/llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll new file mode 100644 index 0000000000000..6995b97e79887 --- /dev/null +++ b/llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -passes=elim-avail-extern -avail-extern-gv-in-addrspace-to-local=3 %s -o - | FileCheck %s + +@shared = internal addrspace(3) global i32 undef, align 4 +@shared.imported = available_externally hidden unnamed_addr addrspace(3) global i32 undef, align 4 + +;. +; CHECK: @shared = internal addrspace(3) global i32 undef, align 4 +; CHECK: @shared.imported.__uniq.[[UUID:.*]] = internal unnamed_addr addrspace(3) global i32 undef, align 4 +;. +define void @foo(i32 %v) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: i32 [[V:%.*]]) { +; CHECK-NEXT: store i32 [[V]], ptr addrspace(3) @shared, align 4 +; CHECK-NEXT: store i32 [[V]], ptr addrspace(3) @shared.imported.__uniq.[[UUID]], align 4 +; CHECK-NEXT: ret void +; + store i32 %v, ptr addrspace(3) @shared, align 4 + store i32 %v, ptr addrspace(3) @shared.imported, align 4 + ret void +} diff --git a/llvm/test/Transforms/GVN/opt-remarks.ll b/llvm/test/Transforms/GVN/opt-remarks.ll index 7c3f16917bc97..8fb2d5756f95d 100644 --- a/llvm/test/Transforms/GVN/opt-remarks.ll +++ b/llvm/test/Transforms/GVN/opt-remarks.ll @@ -62,6 +62,19 @@ ; YAML-NEXT: - ClobberedBy: store ; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 2, Column: 10 } ; YAML-NEXT: ... 
+; YAML-NEXT: --- !Missed +; YAML-NEXT: Pass: gvn +; YAML-NEXT: Name: LoadClobbered +; YAML-NEXT: Function: lifetime_end +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'load of type ' +; YAML-NEXT: - Type: i8 +; YAML-NEXT: - String: ' not eliminated' +; YAML-NEXT: - String: ' in favor of ' +; YAML-NEXT: - OtherAccess: store +; YAML-NEXT: - String: ' because it is clobbered by ' +; YAML-NEXT: - ClobberedBy: call llvm.lifetime.end.p0 +; YAML-NEXT: ... define i32 @arg(ptr %p, i32 %i) { entry: @@ -93,6 +106,15 @@ entry: %add = add i32 %load1, %load ret i32 %add } + +define i8 @lifetime_end(ptr %p, i8 %val) { + call void @llvm.lifetime.start.p0(i64 32, ptr %p) + store i8 %val, ptr %p + call void @llvm.lifetime.end.p0(i64 32, ptr %p) + %1 = load i8, ptr %p + ret i8 %1 +} + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5} !llvm.ident = !{!6} diff --git a/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll b/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll new file mode 100644 index 0000000000000..fb2fdb116f904 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p indvars -S %s | FileCheck %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + +declare void @use(ptr) +declare void @use.i64(i64) + +define i64 @test_simplifycompare_rhs_constant(i64 %num_bytes, ptr %src) { +; CHECK-LABEL: define i64 @test_simplifycompare_rhs_constant( +; CHECK-SAME: i64 [[NUM_BYTES:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[CMP_NOT_I:%.*]] = icmp ne i64 [[NUM_BYTES]], 0 +; CHECK-NEXT: [[COND_I:%.*]] = zext i1 [[CMP_NOT_I]] to i64 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[C_0:%.*]] = icmp ule 
i64 [[IV]], [[COND_I]] +; CHECK-NEXT: tail call void @llvm.assume(i1 [[C_0]]) +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[L]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: call void @use(ptr [[SRC]]) +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: br label %[[LOOP]] +; +entry: + %cmp.not.i = icmp ne i64 %num_bytes, 0 + %cond.i = zext i1 %cmp.not.i to i64 + br label %loop + +loop: + %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ] + %c.0 = icmp ule i64 %iv, %cond.i + tail call void @llvm.assume(i1 %c.0) + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src, align 4 + %c.1 = icmp eq i32 %l, 0 + br i1 %c.1, label %then, label %loop.latch + +then: + call void @use(ptr %src) + br label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + br label %loop +} + +define void @test_simplifycompare_rhs_not_constant1() { +; CHECK-LABEL: define void @test_simplifycompare_rhs_not_constant1() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P:%.*]] = alloca i64, align 8 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: call void @use(ptr [[P]]) +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %p = alloca i64, align 8 + br label %loop + +loop: + %ptr.iv = phi ptr [ %p, %entry ], [ %ptr.iv.next, %loop ] + %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 -8 + call void @use(ptr %ptr.iv) + %ec = icmp ult ptr %ptr.iv.next, %p + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_simplifycompare_rhs_not_constant2(i32 %x) { +; CHECK-LABEL: define void @test_simplifycompare_rhs_not_constant2( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: 
[[ENTRY:.*]]: +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER_LOOPEXIT:.*]]: +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw i64 [[INDVARS_IV:%.*]], 2 +; CHECK-NEXT: br label %[[OUTER_HEADER]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[INDVARS_IV]] = phi i64 [ [[INDVARS_IV_NEXT]], %[[OUTER_HEADER_LOOPEXIT]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[OUTER_HEADER_LOOPEXIT]] ] +; CHECK-NEXT: [[C_1:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_LOOP_PREHEADER:.*]], label %[[OUTER_LATCH_PREHEADER:.*]] +; CHECK: [[EXIT_LOOP_PREHEADER]]: +; CHECK-NEXT: [[INDVARS_IV_LCSSA:%.*]] = phi i64 [ [[INDVARS_IV]], %[[OUTER_HEADER]] ] +; CHECK-NEXT: br label %[[EXIT_LOOP:.*]] +; CHECK: [[OUTER_LATCH_PREHEADER]]: +; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i32 [[IV_1]], 2 +; CHECK-NEXT: br label %[[OUTER_LATCH:.*]] +; CHECK: [[OUTER_LATCH]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ 0, %[[OUTER_LATCH_PREHEADER]] ], [ [[X]], %[[OUTER_LATCH]] ] +; CHECK-NEXT: [[C_2:%.*]] = icmp ult i32 [[P]], [[IV_1_NEXT]] +; CHECK-NEXT: br i1 [[C_2]], label %[[OUTER_LATCH]], label %[[OUTER_HEADER_LOOPEXIT]] +; CHECK: [[EXIT_LOOP]]: +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_LCSSA]], %[[EXIT_LOOP_PREHEADER]] ], [ [[INDVARS_IV_NEXT2:%.*]], %[[EXIT_LOOP]] ] +; CHECK-NEXT: call void @use.i64(i64 [[INDVARS_IV1]]) +; CHECK-NEXT: [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], 1 +; CHECK-NEXT: br label %[[EXIT_LOOP]] +; +entry: + br label %outer.header + +outer.header: + %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %outer.latch ] + %c.1 = icmp sgt i32 %x, 0 + br i1 %c.1, label %exit.loop, label %outer.latch.preheader + +outer.latch.preheader: + %iv.1.next = add nsw i32 %iv.1, 2 + br label %outer.latch + +outer.latch: + %p = phi i32 [ 0, %outer.latch.preheader ], [ %x, %outer.latch ] + %c.2 = icmp ult i32 %p, %iv.1.next + br i1 %c.2, label %outer.latch, label %outer.header + 
+exit.loop: + %iv.2 = phi i32 [ %iv.1, %outer.header ], [ %iv.2.next, %exit.loop ] + %iv.2.ext = zext i32 %iv.2 to i64 + call void @use.i64(i64 %iv.2.ext) + %iv.2.next = add nsw i32 %iv.2, 1 + br label %exit.loop +} + +define void @test_simplifycompare_rhs_addrec(i32 %x) { +; CHECK-LABEL: define void @test_simplifycompare_rhs_addrec( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER_LOOPEXIT:.*]]: +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV:%.*]], 2 +; CHECK-NEXT: br label %[[OUTER_HEADER]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[INDVARS_IV]] = phi i64 [ [[INDVARS_IV_NEXT]], %[[OUTER_HEADER_LOOPEXIT]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ 2, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[OUTER_HEADER_LOOPEXIT]] ] +; CHECK-NEXT: [[C_1:%.*]] = icmp sgt i32 [[X]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[OUTER_EXIT:.*]], label %[[OUTER_LATCH_PREHEADER:.*]] +; CHECK: [[OUTER_LATCH_PREHEADER]]: +; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i32 [[IV_1]], 2 +; CHECK-NEXT: br label %[[OUTER_LATCH:.*]] +; CHECK: [[OUTER_LATCH]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[X]], %[[OUTER_LATCH]] ], [ 0, %[[OUTER_LATCH_PREHEADER]] ] +; CHECK-NEXT: [[C_2:%.*]] = icmp ult i32 [[P]], [[IV_1_NEXT]] +; CHECK-NEXT: br i1 [[C_2]], label %[[OUTER_LATCH]], label %[[OUTER_HEADER_LOOPEXIT]] +; CHECK: [[OUTER_EXIT]]: +; CHECK-NEXT: [[INDVARS_IV_LCSSA:%.*]] = phi i64 [ [[INDVARS_IV]], %[[OUTER_HEADER]] ] +; CHECK-NEXT: br label %[[EXIT_LOOP:.*]] +; CHECK: [[EXIT_LOOP]]: +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], %[[EXIT_LOOP]] ], [ [[INDVARS_IV_LCSSA]], %[[OUTER_EXIT]] ] +; CHECK-NEXT: call void @use.i64(i64 [[INDVARS_IV1]]) +; CHECK-NEXT: [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], 1 +; CHECK-NEXT: br label %[[EXIT_LOOP]] +; +entry: + br label %outer.header + +outer.header: + %iv.1 = phi i32 [ 2, %entry ], [ %iv.1.next, 
%outer.latch ] + %c.1 = icmp sgt i32 %x, 0 + br i1 %c.1, label %outer.exit, label %outer.latch.preheader + +outer.latch.preheader: + %iv.1.next = add nuw nsw i32 %iv.1, 2 + br label %outer.latch + +outer.latch: + %p = phi i32 [ %x, %outer.latch ], [ 0, %outer.latch.preheader ] + %c.2 = icmp ult i32 %p, %iv.1.next + br i1 %c.2, label %outer.latch, label %outer.header + +outer.exit: + %sub = add nsw i32 %iv.1, -2 + br label %exit.loop + +exit.loop: + %iv.2 = phi i32 [ %sub, %outer.exit ], [ %iv.2.next, %exit.loop ] + %iv.2.ext = sext i32 %iv.2 to i64 + call void @use.i64(i64 %iv.2.ext) + %iv.2.next = add nsw i32 %iv.2, 1 + br label %exit.loop +} diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll index 30431ad724843..ee5ccf5af987d 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll @@ -3,6 +3,7 @@ ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx810 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx900 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1010 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s +; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1100 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s define amdgpu_ps half @image_sample_2d_fptrunc_to_d16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; GFX7-LABEL: @image_sample_2d_fptrunc_to_d16( @@ -121,6 +122,123 @@ main_body: ret half %addf_sum.2 } +define amdgpu_ps half @image_sample_2d_multi_fptrunc_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) { +; GFX7-LABEL: @image_sample_2d_multi_fptrunc_to_d16( +; GFX7-NEXT: main_body: +; GFX7-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> 
[[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; GFX7-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0 +; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half +; GFX7-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1 +; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half +; GFX7-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2 +; GFX7-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to half +; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]] +; GFX7-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[H2]] +; GFX7-NEXT: ret half [[RES]] +; +; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_to_d16( +; GFX81PLUS-NEXT: main_body: +; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0) +; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0 +; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1 +; GFX81PLUS-NEXT: [[H2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2 +; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]] +; GFX81PLUS-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[H2]] +; GFX81PLUS-NEXT: ret half [[RES]] +; +main_body: + %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0) + %e0 = extractelement <4 x float> %sample, i32 0 + %h0 = fptrunc float %e0 to half + %e1 = extractelement <4 x float> %sample, i32 1 + %h1 = fptrunc float %e1 to half + %e2 = extractelement <4 x float> %sample, i32 2 + %h2 = fptrunc float %e2 to half + %mul = fmul half %h0, %h1 + %res = fadd half %mul, %h2 + ret half %res +} + +define amdgpu_ps half @image_sample_2d_extractelement_multi_use_no_d16(<8 x i32> %surf_desc, <4 x i32> %sampler_desc, float %u, float %v) { +; GFX7-LABEL: 
@image_sample_2d_extractelement_multi_use_no_d16( +; GFX7-NEXT: main_body: +; GFX7-NEXT: [[SAMPLE:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0) +; GFX7-NEXT: [[E0:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 0 +; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half +; GFX7-NEXT: [[USER2:%.*]] = fadd float [[E0]], 1.000000e+00 +; GFX7-NEXT: [[HALF:%.*]] = fptrunc float [[USER2]] to half +; GFX7-NEXT: [[E1:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 1 +; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half +; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]] +; GFX7-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF]] +; GFX7-NEXT: ret half [[RES]] +; +; GFX81PLUS-LABEL: @image_sample_2d_extractelement_multi_use_no_d16( +; GFX81PLUS-NEXT: main_body: +; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0) +; GFX81PLUS-NEXT: [[E0:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 0 +; GFX81PLUS-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half +; GFX81PLUS-NEXT: [[USER2:%.*]] = fadd float [[E0]], 1.000000e+00 +; GFX81PLUS-NEXT: [[HALF:%.*]] = fptrunc float [[USER2]] to half +; GFX81PLUS-NEXT: [[E1:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 1 +; GFX81PLUS-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half +; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]] +; GFX81PLUS-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF]] +; GFX81PLUS-NEXT: ret half [[RES]] +; +main_body: + %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %sampler_desc, i1 false, i32 0, i32 0) + %e0 = extractelement <4 x float> %sample, i32 0 + %h0 = fptrunc float %e0 to 
half + %user2 = fadd float %e0, 1.0 + %half = fptrunc float %user2 to half + %e1 = extractelement <4 x float> %sample, i32 1 + %h1 = fptrunc float %e1 to half + %mul = fmul half %h0, %h1 + %res = fadd half %mul, %half + ret half %res +} + +define amdgpu_ps bfloat @image_sample_2d_multi_fptrunc_non_half_no_d16(<8 x i32> %surf_desc, <4 x i32> %sampler_desc, float %u, float %v) { +; GFX7-LABEL: @image_sample_2d_multi_fptrunc_non_half_no_d16( +; GFX7-NEXT: main_body: +; GFX7-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0) +; GFX7-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0 +; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to bfloat +; GFX7-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1 +; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to bfloat +; GFX7-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2 +; GFX7-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to bfloat +; GFX7-NEXT: [[MUL:%.*]] = fmul bfloat [[H0]], [[H1]] +; GFX7-NEXT: [[RES:%.*]] = fadd bfloat [[MUL]], [[H2]] +; GFX7-NEXT: ret bfloat [[RES]] +; +; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_non_half_no_d16( +; GFX81PLUS-NEXT: main_body: +; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0) +; GFX81PLUS-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0 +; GFX81PLUS-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to bfloat +; GFX81PLUS-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1 +; GFX81PLUS-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to bfloat +; GFX81PLUS-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2 +; GFX81PLUS-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to bfloat +; 
GFX81PLUS-NEXT: [[MUL:%.*]] = fmul bfloat [[H0]], [[H1]] +; GFX81PLUS-NEXT: [[RES:%.*]] = fadd bfloat [[MUL]], [[H2]] +; GFX81PLUS-NEXT: ret bfloat [[RES]] +; +main_body: + %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %sampler_desc, i1 false, i32 0, i32 0) + %e0 = extractelement <4 x float> %sample, i32 0 + %h0 = fptrunc float %e0 to bfloat + %e1 = extractelement <4 x float> %sample, i32 1 + %h1 = fptrunc float %e1 to bfloat + %e2 = extractelement <4 x float> %sample, i32 2 + %h2 = fptrunc float %e2 to bfloat + %mul = fmul bfloat %h0, %h1 + %res = fadd bfloat %mul, %h2 + ret bfloat %res +} + define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) { ; GFX7-LABEL: @image_gather4_2d_v4f32( ; GFX7-NEXT: main_body: diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll index 495f99824652d..a16e30bb49452 100644 --- a/llvm/test/Transforms/InstCombine/add.ll +++ b/llvm/test/Transforms/InstCombine/add.ll @@ -4273,4 +4273,265 @@ define i32 @fold_zext_nneg_add_const_fail2(i8 %x) { } declare void @llvm.assume(i1) +declare i32 @llvm.ctlz.i32(i32, i1) + +; Ceiling division by power-of-2: (x >> log2(N)) + ((x & (N-1)) != 0) -> (x + (N-1)) >> log2(N) +; This is only valid when x + (N-1) doesn't overflow + +; Test with known range that prevents overflow +define i32 @ceil_div_by_8_known_range(i32 range(i32 0, 100) %x) { +; CHECK-LABEL: @ceil_div_by_8_known_range( +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7 +; CHECK-NEXT: [[R:%.*]] = lshr i32 [[TMP1]], 3 +; CHECK-NEXT: ret i32 [[R]] +; + %shr = lshr i32 %x, 3 + %and = and i32 %x, 7 + %cmp = icmp ne i32 %and, 0 + %ext = zext i1 %cmp to i32 + %r = add i32 %shr, %ext + ret i32 %r +} + +; Test with the exact IR from the original testcase +define i32 @ceil_div_from_clz(i32 %v) { +; CHECK-LABEL: @ceil_div_from_clz( +; CHECK-NEXT: 
[[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = sub nuw nsw i32 39, [[CTLZ]] +; CHECK-NEXT: [[R:%.*]] = lshr i32 [[TMP1]], 3 +; CHECK-NEXT: ret i32 [[R]] +; + %ctlz = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %v, i1 false) + %sub = sub nuw nsw i32 32, %ctlz + %shr = lshr i32 %sub, 3 + %and = and i32 %sub, 7 + %cmp = icmp ne i32 %and, 0 + %ext = zext i1 %cmp to i32 + %r = add nuw nsw i32 %shr, %ext + ret i32 %r +} + +; Vector version with known range +define <2 x i32> @ceil_div_by_8_vec_range(<2 x i32> range(i32 0, 1000) %x) { +; CHECK-LABEL: @ceil_div_by_8_vec_range( +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <2 x i32> [[X:%.*]], splat (i32 7) +; CHECK-NEXT: [[R:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 3) +; CHECK-NEXT: ret <2 x i32> [[R]] +; + %shr = lshr <2 x i32> %x, + %and = and <2 x i32> %x, + %cmp = icmp ne <2 x i32> %and, + %ext = zext <2 x i1> %cmp to <2 x i32> + %r = add <2 x i32> %shr, %ext + ret <2 x i32> %r +} + +; Ceiling division by 16 with known range +define i16 @ceil_div_by_16_i16(i16 range(i16 0, 1000) %x) { +; CHECK-LABEL: @ceil_div_by_16_i16( +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i16 [[X:%.*]], 15 +; CHECK-NEXT: [[R:%.*]] = lshr i16 [[TMP1]], 4 +; CHECK-NEXT: ret i16 [[R]] +; + %shr = lshr i16 %x, 4 + %and = and i16 %x, 15 + %cmp = icmp ne i16 %and, 0 + %ext = zext i1 %cmp to i16 + %r = add i16 %shr, %ext + ret i16 %r +} + +; Negative test: no overflow guarantee - should NOT optimize +define i32 @ceil_div_by_8_no_overflow_info(i32 %x) { +; CHECK-LABEL: @ceil_div_by_8_no_overflow_info( +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]] +; CHECK-NEXT: ret i32 [[R]] +; + %shr = lshr i32 %x, 3 + %and = and i32 %x, 7 + %cmp = icmp ne i32 %and, 0 + %ext = zext i1 
%cmp to i32 + %r = add i32 %shr, %ext + ret i32 %r +} + +; Negative test: nuw on final add doesn't help +define i32 @ceil_div_by_8_only_nuw_on_add(i32 %x) { +; CHECK-LABEL: @ceil_div_by_8_only_nuw_on_add( +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]] +; CHECK-NEXT: ret i32 [[R]] +; + %shr = lshr i32 %x, 3 + %and = and i32 %x, 7 + %cmp = icmp ne i32 %and, 0 + %ext = zext i1 %cmp to i32 + %r = add nuw i32 %shr, %ext ; nuw here doesn't prove x+7 won't overflow + ret i32 %r +} + +; Negative test: wrong mask +define i32 @ceil_div_wrong_mask(i32 range(i32 0, 100) %x) { +; CHECK-LABEL: @ceil_div_wrong_mask( +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 6 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]] +; CHECK-NEXT: ret i32 [[R]] +; + %shr = lshr i32 %x, 3 + %and = and i32 %x, 6 ; Wrong mask: should be 7 + %cmp = icmp ne i32 %and, 0 + %ext = zext i1 %cmp to i32 + %r = add i32 %shr, %ext + ret i32 %r +} + +; Negative test: wrong shift amount +define i32 @ceil_div_wrong_shift(i32 range(i32 0, 100) %x) { +; CHECK-LABEL: @ceil_div_wrong_shift( +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 4 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]] +; CHECK-NEXT: ret i32 [[R]] +; + %shr = lshr i32 %x, 4 ; Shift by 4, but mask is 7 (should be 15) + %and = and i32 %x, 7 + %cmp = icmp ne i32 %and, 0 + %ext = zext i1 %cmp to i32 + %r = add i32 %shr, %ext + ret i32 %r +} + +; Negative test: wrong comparison +define i32 @ceil_div_wrong_cmp(i32 range(i32 0, 
100) %x) { +; CHECK-LABEL: @ceil_div_wrong_cmp( +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]] +; CHECK-NEXT: ret i32 [[R]] +; + %shr = lshr i32 %x, 3 + %and = and i32 %x, 7 + %cmp = icmp eq i32 %and, 0 ; Wrong: should be ne + %ext = zext i1 %cmp to i32 + %r = add i32 %shr, %ext + ret i32 %r +} + +; Multi-use test: all intermediate values have uses +define i32 @ceil_div_multi_use(i32 range(i32 0, 100) %x) { +; CHECK-LABEL: @ceil_div_multi_use( +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3 +; CHECK-NEXT: call void @use_i32(i32 [[SHR]]) +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7 +; CHECK-NEXT: call void @use_i32(i32 [[AND]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: call void @use_i32(i32 [[EXT]]) +; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]] +; CHECK-NEXT: ret i32 [[R]] +; + %shr = lshr i32 %x, 3 + call void @use_i32(i32 %shr) + %and = and i32 %x, 7 + call void @use_i32(i32 %and) + %cmp = icmp ne i32 %and, 0 + %ext = zext i1 %cmp to i32 + call void @use_i32(i32 %ext) + %r = add i32 %shr, %ext + ret i32 %r +} + +; Commuted test: add operands are swapped +define i32 @ceil_div_commuted(i32 range(i32 0, 100) %x) { +; CHECK-LABEL: @ceil_div_commuted( +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7 +; CHECK-NEXT: [[R:%.*]] = lshr i32 [[TMP1]], 3 +; CHECK-NEXT: ret i32 [[R]] +; + %shr = lshr i32 %x, 3 + %and = and i32 %x, 7 + %cmp = icmp ne i32 %and, 0 + %ext = zext i1 %cmp to i32 + %r = add i32 %ext, %shr ; Operands swapped + ret i32 %r +} + +; Commuted with multi-use +define i32 @ceil_div_commuted_multi_use(i32 range(i32 0, 100) %x) { +; CHECK-LABEL: @ceil_div_commuted_multi_use( +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3 +; CHECK-NEXT: call 
void @use_i32(i32 [[SHR]]) +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: call void @use_i32(i32 [[EXT]]) +; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]] +; CHECK-NEXT: ret i32 [[R]] +; + %shr = lshr i32 %x, 3 + call void @use_i32(i32 %shr) + %and = and i32 %x, 7 + %cmp = icmp ne i32 %and, 0 + %ext = zext i1 %cmp to i32 + call void @use_i32(i32 %ext) + %r = add i32 %ext, %shr ; Operands swapped + ret i32 %r +} + +; Multi-use test where only zext has multiple uses - should still optimize +define i32 @ceil_div_zext_multi_use(i32 range(i32 0, 100) %x) { +; CHECK-LABEL: @ceil_div_zext_multi_use( +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: call void @use_i32(i32 [[EXT]]) +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[X]], 7 +; CHECK-NEXT: [[R:%.*]] = lshr i32 [[TMP1]], 3 +; CHECK-NEXT: ret i32 [[R]] +; + %shr = lshr i32 %x, 3 + %and = and i32 %x, 7 + %cmp = icmp ne i32 %and, 0 + %ext = zext i1 %cmp to i32 + call void @use_i32(i32 %ext) + %r = add i32 %shr, %ext + ret i32 %r +} + +; Multi-use with vector type +define <2 x i32> @ceil_div_vec_multi_use(<2 x i32> range(i32 0, 1000) %x) { +; CHECK-LABEL: @ceil_div_vec_multi_use( +; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i32> [[X:%.*]], splat (i32 3) +; CHECK-NEXT: call void @use_vec(<2 x i32> [[SHR]]) +; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X]], splat (i32 7) +; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer +; CHECK-NEXT: [[EXT:%.*]] = zext <2 x i1> [[CMP]] to <2 x i32> +; CHECK-NEXT: [[R:%.*]] = add nuw nsw <2 x i32> [[SHR]], [[EXT]] +; CHECK-NEXT: ret <2 x i32> [[R]] +; + %shr = lshr <2 x i32> %x, + call void @use_vec(<2 x i32> %shr) + %and = and <2 x i32> %x, + %cmp = icmp ne <2 x i32> %and, + %ext = zext <2 x i1> %cmp to <2 x i32> + %r = add <2 
x i32> %shr, %ext + ret <2 x i32> %r +} + +declare void @use_i32(i32) +declare void @use_vec(<2 x i32>) declare void @fake_func(i32) diff --git a/llvm/test/Transforms/InstCombine/icmp-subadd.ll b/llvm/test/Transforms/InstCombine/icmp-subadd.ll new file mode 100644 index 0000000000000..fd7e1250d893f --- /dev/null +++ b/llvm/test/Transforms/InstCombine/icmp-subadd.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i1 @test-same-operands-sub-add-nsw-icmp-sgt(i8 %a, i8 %b) { +; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-sgt( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[B]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %sub = sub nsw i8 %a, %b + %add = add nsw i8 %a, %b + %cmp = icmp sgt i8 %sub, %add + ret i1 %cmp +} + +define i1 @test-same-operands-sub-add-nsw-icmp-slt(i8 %a, i8 %b) { +; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-slt( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[B]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %sub = sub nsw i8 %a, %b + %add = add nsw i8 %a, %b + %cmp = icmp slt i8 %sub, %add + ret i1 %cmp +} + +define i1 @test-same-operands-sub-add-nsw-icmp-sle(i8 %a, i8 %b) { +; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-sle( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[B]], -1 +; CHECK-NEXT: ret i1 [[CMP]] +; + %sub = sub nsw i8 %a, %b + %add = add nsw i8 %a, %b + %cmp = icmp sle i8 %sub, %add + ret i1 %cmp +} + +define i1 @test-same-operands-sub-add-nsw-nuw-icmp-eq(i8 %a, i8 %b) { +; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-nuw-icmp-eq( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[B]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %sub = sub nsw i8 %a, %b + %add = add nuw i8 %a, %b + %cmp = icmp eq i8 %sub, 
%add + ret i1 %cmp +} + +define i1 @test-same-operands-sub-add-nsw-icmp-eq(i8 %a, i8 %b) { +; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-eq( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[B]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %sub = sub nsw i8 %a, %b + %add = add nsw i8 %a, %b + %cmp = icmp eq i8 %sub, %add + ret i1 %cmp +} + +define i1 @test-add-sub-nsw-icmp-sgt(i8 %a, i8 %b) { +; CHECK-LABEL: define i1 @test-add-sub-nsw-icmp-sgt( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[B]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %sub = sub nsw i8 %a, %b + %add = add nsw i8 %a, %b + %cmp = icmp sgt i8 %add, %sub + ret i1 %cmp +} + +define i1 @test-add-sub-nuw-icmp-uge(i8 %a, i8 %b) { +; CHECK-LABEL: define i1 @test-add-sub-nuw-icmp-uge( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %sub = sub nuw i8 %a, %b + %add = add nuw i8 %a, %b + %cmp = icmp uge i8 %add, %sub + ret i1 %cmp +} + +; Check not folded +define i1 @test-add-sub-nuw-icmp-sge(i8 %a, i8 %b) { +; CHECK-LABEL: define i1 @test-add-sub-nuw-icmp-sge( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: [[SUB:%.*]] = sub nuw i8 [[A]], [[B]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw i8 [[A]], [[B]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i8 [[ADD]], [[SUB]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %sub = sub nuw i8 %a, %b + %add = add nuw i8 %a, %b + %cmp = icmp sge i8 %add, %sub + ret i1 %cmp +} + +define i1 @test-add-swap-sub-nuw-icmp-uge(i8 %a, i8 %b) { +; CHECK-LABEL: define i1 @test-add-swap-sub-nuw-icmp-uge( +; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %sub = sub nuw i8 %a, %b + %add = add nuw i8 %b, %a + %cmp = icmp uge i8 %add, %sub + ret i1 %cmp +} diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll index dd75560e25ced..061c6834eb97d 100644 --- 
a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll @@ -441,3 +441,39 @@ define i128 @load-128bit(){ %1 = load i128, ptr @global128, align 4 ret i128 %1 } + + +@i40_struct = constant { i40, i8 } { i40 0, i8 1 } +@i40_array = constant [2 x i40] [i40 0, i40 1] + +define i8 @load_i40_struct_padding() { +; CHECK-LABEL: @load_i40_struct_padding( +; CHECK-NEXT: ret i8 0 +; + %v = load i8, ptr getelementptr (i8, ptr @i40_struct, i64 6) + ret i8 %v +} + +define i16 @load_i40_struct_partial_padding() { +; CHECK-LABEL: @load_i40_struct_partial_padding( +; CHECK-NEXT: ret i16 0 +; + %v = load i16, ptr getelementptr (i8, ptr @i40_struct, i64 4) + ret i16 %v +} + +define i8 @load_i40_array_padding() { +; CHECK-LABEL: @load_i40_array_padding( +; CHECK-NEXT: ret i8 0 +; + %v = load i8, ptr getelementptr (i8, ptr @i40_array, i64 6) + ret i8 %v +} + +define i16 @load_i40_array_partial_padding() { +; CHECK-LABEL: @load_i40_array_partial_padding( +; CHECK-NEXT: ret i16 0 +; + %v = load i16, ptr getelementptr (i8, ptr @i40_array, i64 4) + ret i16 %v +} diff --git a/llvm/test/Transforms/InstSimplify/vp-reverse.ll b/llvm/test/Transforms/InstSimplify/vp-reverse.ll index 3c3bb871dc610..f19a2ac8ca9e1 100644 --- a/llvm/test/Transforms/InstSimplify/vp-reverse.ll +++ b/llvm/test/Transforms/InstSimplify/vp-reverse.ll @@ -3,9 +3,7 @@ define @rev_of_rev( %a, i32 %evl) { ; CHECK-LABEL: @rev_of_rev( -; CHECK-NEXT: [[A_REV:%.*]] = tail call @llvm.experimental.vp.reverse.nxv4i32( [[A:%.*]], splat (i1 true), i32 [[EVL:%.*]]) -; CHECK-NEXT: [[RES:%.*]] = tail call @llvm.experimental.vp.reverse.nxv4i32( [[A_REV]], splat (i1 true), i32 [[EVL]]) -; CHECK-NEXT: ret [[RES]] +; CHECK-NEXT: ret [[A:%.*]] ; %a.rev = tail call @llvm.experimental.vp.reverse( %a, splat (i1 true), i32 %evl) %res = tail call @llvm.experimental.vp.reverse( %a.rev, splat (i1 true), i32 %evl) @@ -25,8 +23,7 @@ define @rev_of_rev_diffevl( %a, i32 %evl) { define 
@rev_of_poison(i32 %evl) { ; CHECK-LABEL: @rev_of_poison( -; CHECK-NEXT: [[REV:%.*]] = tail call @llvm.experimental.vp.reverse.nxv4i32( poison, splat (i1 true), i32 [[EVL:%.*]]) -; CHECK-NEXT: ret [[REV]] +; CHECK-NEXT: ret poison ; %rev = tail call @llvm.experimental.vp.reverse( poison, splat (i1 true), i32 %evl) ret %rev @@ -34,8 +31,7 @@ define @rev_of_poison(i32 %evl) { define @rev_of_undef(i32 %evl) { ; CHECK-LABEL: @rev_of_undef( -; CHECK-NEXT: [[REV:%.*]] = tail call @llvm.experimental.vp.reverse.nxv4i32( undef, splat (i1 true), i32 [[EVL:%.*]]) -; CHECK-NEXT: ret [[REV]] +; CHECK-NEXT: ret undef ; %rev = tail call @llvm.experimental.vp.reverse( undef, splat (i1 true), i32 %evl) ret %rev @@ -43,8 +39,7 @@ define @rev_of_undef(i32 %evl) { define @rev_of_zero(i32 %evl) { ; CHECK-LABEL: @rev_of_zero( -; CHECK-NEXT: [[REV:%.*]] = tail call @llvm.experimental.vp.reverse.nxv4i32( zeroinitializer, splat (i1 true), i32 [[EVL:%.*]]) -; CHECK-NEXT: ret [[REV]] +; CHECK-NEXT: ret zeroinitializer ; %rev = tail call @llvm.experimental.vp.reverse( zeroinitializer, splat (i1 true), i32 %evl) ret %rev @@ -54,8 +49,7 @@ define @rev_of_splat(i32 %a, i32 %evl) { ; CHECK-LABEL: @rev_of_splat( ; CHECK-NEXT: [[A_INS:%.*]] = insertelement poison, i32 [[A:%.*]], i32 0 ; CHECK-NEXT: [[A_VEC:%.*]] = shufflevector [[A_INS]], poison, zeroinitializer -; CHECK-NEXT: [[REV:%.*]] = tail call @llvm.experimental.vp.reverse.nxv4i32( [[A_VEC]], splat (i1 true), i32 [[EVL:%.*]]) -; CHECK-NEXT: ret [[REV]] +; CHECK-NEXT: ret [[A_VEC]] ; %a.ins = insertelement poison, i32 %a, i32 0 %a.vec = shufflevector %a.ins, poison, zeroinitializer @@ -67,8 +61,7 @@ define @rev_of_splat2(i32 %a, %m, i32 %evl) ; CHECK-LABEL: @rev_of_splat2( ; CHECK-NEXT: [[A_INS:%.*]] = insertelement poison, i32 [[A:%.*]], i32 0 ; CHECK-NEXT: [[A_VEC:%.*]] = shufflevector [[A_INS]], poison, zeroinitializer -; CHECK-NEXT: [[REV:%.*]] = tail call @llvm.experimental.vp.reverse.nxv4i32( [[A_VEC]], [[M:%.*]], i32 [[EVL:%.*]]) -; 
CHECK-NEXT: ret [[REV]] +; CHECK-NEXT: ret [[A_VEC]] ; %a.ins = insertelement poison, i32 %a, i32 0 %a.vec = shufflevector %a.ins, poison, zeroinitializer diff --git a/llvm/test/Transforms/LICM/funclet.ll b/llvm/test/Transforms/LICM/funclet.ll index cacb0c90d3702..1cdd12ddc98e7 100644 --- a/llvm/test/Transforms/LICM/funclet.ll +++ b/llvm/test/Transforms/LICM/funclet.ll @@ -14,7 +14,7 @@ define void @test1(ptr %s, i1 %b) personality ptr @__CxxFrameHandler3 { ; CHECK-NEXT: br i1 [[B:%.*]], label [[TRY_CONT_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: invoke void @may_throw() -; CHECK-NEXT: to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK-NEXT: to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[DOTLCSSA1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY]] ] ; CHECK-NEXT: [[CS:%.*]] = catchswitch within none [label %catch] unwind to caller @@ -59,7 +59,7 @@ define void @test2(ptr %s, i1 %b) personality ptr @__CxxFrameHandler3 { ; CHECK-NEXT: br i1 [[B:%.*]], label [[TRY_CONT:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: invoke void @may_throw() -; CHECK-NEXT: to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK-NEXT: to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[CP:%.*]] = cleanuppad within none [] ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @pure_computation() [ "funclet"(token [[CP]]) ] @@ -114,10 +114,10 @@ define void @test3(i1 %a, i1 %b, i1 %c) personality ptr @__CxxFrameHandler3 { ; CHECK-NEXT: [[CS]] = catchswitch within none [label %catch.object.Throwable] unwind to caller ; CHECK: forbody: ; CHECK-NEXT: invoke void @may_throw() -; CHECK-NEXT: to label [[POSTINVOKE:%.*]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK-NEXT: to label [[POSTINVOKE:%.*]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: else: ; CHECK-NEXT: invoke void @may_throw() -; CHECK-NEXT: to label [[FORCOND_BACKEDGE]] 
unwind label [[CATCH_DISPATCH]] +; CHECK-NEXT: to label [[FORCOND_BACKEDGE]] unwind label [[CATCH_DISPATCH]] ; entry: %.frame = alloca i8, align 4 diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll new file mode 100644 index 0000000000000..5976658ccdf86 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64-none-elf | FileCheck %s + +; Check that the load in the loop has postindex addressing, regardless of the +; type or whether the input uses postindex or offset addressing. + +define i32 @i32_initially_postidx(ptr %p, i64 %n) { +; CHECK-LABEL: i32_initially_postidx: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp x1, #1 +; CHECK-NEXT: b.lt .LBB0_5 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: .LBB0_2: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr w9, [x0], #4 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: b.lo .LBB0_5 +; CHECK-NEXT: // %bb.3: // %for.inc +; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: subs x1, x1, #1 +; CHECK-NEXT: b.ne .LBB0_2 +; CHECK-NEXT: // %bb.4: // %cleanup +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret +entry: + %cmp1 = icmp sgt i64 %n, 0 + br i1 %cmp1, label %for.body, label %cleanup + +for.body: + %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ] + %accum = phi i32 [ %add, %for.inc ], [ 0, %entry ] + %ptr = phi ptr [ %ptr.next, %for.inc ], [ %p, %entry ] + %val = load i32, ptr %ptr, align 4 + %ptr.next = getelementptr inbounds nuw i8, ptr %ptr, i64 4 + %add = add i32 %accum, %val + %cmp2 = icmp ult i32 %add, 0 + br i1 %cmp2, label %cleanup, label %for.inc 
+ +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %cleanup, label %for.body + +cleanup: + %ret = phi i32 [ 0, %entry ], [ 0, %for.body ], [ %add, %for.inc ] + ret i32 %ret +} + +define i32 @i32_initially_offset(ptr %p, i64 %n) { +; CHECK-LABEL: i32_initially_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp x1, #1 +; CHECK-NEXT: b.lt .LBB1_5 +; CHECK-NEXT: // %bb.1: // %for.body.preheader +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: .LBB1_2: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr w9, [x0], #4 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: b.lo .LBB1_5 +; CHECK-NEXT: // %bb.3: // %for.cond +; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 +; CHECK-NEXT: subs x1, x1, #1 +; CHECK-NEXT: b.ne .LBB1_2 +; CHECK-NEXT: // %bb.4: // %cleanup +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_5: +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret +entry: + %cmp1 = icmp sgt i64 %n, 0 + br i1 %cmp1, label %for.body, label %cleanup + +for.cond: + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %cleanup, label %for.body + +for.body: + %iv = phi i64 [ %iv.next, %for.cond ], [ 0, %entry ] + %accum = phi i32 [ %add, %for.cond ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv + %val = load i32, ptr %arrayidx, align 4 + %add = add i32 %accum, %val + %cmp2 = icmp ult i32 %add, 0 + br i1 %cmp2, label %cleanup, label %for.cond + +cleanup: + %ret = phi i32 [ 0, %entry ], [ 0, %for.body ], [ %add, %for.cond ] + ret i32 %ret +} + +define float @float_initially_postidx(ptr %p, i64 %n) { +; CHECK-LABEL: float_initially_postidx: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: cmp x1, #1 +; CHECK-NEXT: b.lt .LBB2_3 +; CHECK-NEXT: .LBB2_1: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; 
CHECK-NEXT: ldr s1, [x0], #4 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: b.mi .LBB2_4 +; CHECK-NEXT: // %bb.2: // %for.inc +; CHECK-NEXT: // in Loop: Header=BB2_1 Depth=1 +; CHECK-NEXT: subs x1, x1, #1 +; CHECK-NEXT: b.ne .LBB2_1 +; CHECK-NEXT: .LBB2_3: // %cleanup +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: ret +entry: + %cmp1 = icmp sgt i64 %n, 0 + br i1 %cmp1, label %for.body, label %cleanup + +for.body: + %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ] + %accum = phi float [ %add, %for.inc ], [ 0.000000e+00, %entry ] + %ptr = phi ptr [ %ptr.next, %for.inc ], [ %p, %entry ] + %val = load float, ptr %ptr, align 4 + %ptr.next = getelementptr inbounds nuw i8, ptr %ptr, i64 4 + %add = fadd float %accum, %val + %cmp2 = fcmp olt float %add, 0.000000e+00 + br i1 %cmp2, label %cleanup, label %for.inc + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %cleanup, label %for.body + +cleanup: + %ret = phi float [ 0.000000e+00, %entry ], [ 0.000000e+00, %for.body ], [ %add, %for.inc ] + ret float %ret +} + +define float @float_initially_offset(ptr %p, i64 %n) { +; CHECK-LABEL: float_initially_offset: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: cmp x1, #1 +; CHECK-NEXT: b.lt .LBB3_3 +; CHECK-NEXT: .LBB3_1: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr s1, [x0], #4 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: b.mi .LBB3_4 +; CHECK-NEXT: // %bb.2: // %for.cond +; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 +; CHECK-NEXT: subs x1, x1, #1 +; CHECK-NEXT: b.ne .LBB3_1 +; CHECK-NEXT: .LBB3_3: // %cleanup +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: ret +entry: + %cmp1 = icmp sgt i64 %n, 0 + br i1 %cmp1, label %for.body, label %cleanup + +for.cond: + %iv.next = add nuw 
nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %cleanup, label %for.body + +for.body: + %iv = phi i64 [ %iv.next, %for.cond ], [ 0, %entry ] + %accum = phi float [ %add, %for.cond ], [ 0.000000e+00, %entry ] + %arrayidx = getelementptr inbounds nuw float, ptr %p, i64 %iv + %val = load float, ptr %arrayidx, align 4 + %add = fadd float %accum, %val + %cmp2 = fcmp olt float %add, 0.000000e+00 + br i1 %cmp2, label %cleanup, label %for.cond + +cleanup: + %ret = phi float [ 0.000000e+00, %entry ], [ 0.000000e+00, %for.body ], [ %add, %for.cond ] + ret float %ret +} diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll index cd098e123b5f6..5e8540814fff2 100644 --- a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll +++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll @@ -41,16 +41,27 @@ define i32 @smin_unit_step() { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: call void @foo(i32 1) +; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT1]], 1023 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[EXIT_PEEL_BEGIN]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP_PEEL:.*]] +; CHECK: [[LOOP_PEEL]]: ; CHECK-NEXT: [[SUB:%.*]] = sub i32 1024, [[IV]] ; CHECK-NEXT: [[MINMAX:%.*]] = call i32 @llvm.smin.i32(i32 [[SUB]], i32 1) ; CHECK-NEXT: call void @foo(i32 [[MINMAX]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EC_PEEL:%.*]] = icmp ne i32 [[IV_NEXT]], 1024 -; 
CHECK-NEXT: br i1 [[EC_PEEL]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK-NEXT: br i1 [[EC_PEEL]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]] +; CHECK: [[EXIT_PEEL_NEXT]]: +; CHECK-NEXT: br label %[[LOOP_PEEL_NEXT:.*]] +; CHECK: [[LOOP_PEEL_NEXT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MINMAX_LCSSA:%.*]] = phi i32 [ [[MINMAX]], %[[LOOP]] ] -; CHECK-NEXT: ret i32 [[MINMAX_LCSSA]] +; CHECK-NEXT: ret i32 [[MINMAX]] ; entry: br label %loop @@ -74,16 +85,28 @@ define i32 @smax_unit_step() { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SUB1:%.*]] = sub nuw nsw i32 1024, [[IV1]] +; CHECK-NEXT: call void @foo(i32 [[SUB1]]) +; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT1]], 1023 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[EXIT_PEEL_BEGIN]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP]] ] +; CHECK-NEXT: br label %[[LOOP_PEEL:.*]] +; CHECK: [[LOOP_PEEL]]: ; CHECK-NEXT: [[SUB:%.*]] = sub i32 1024, [[IV]] ; CHECK-NEXT: [[MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[SUB]], i32 1) ; CHECK-NEXT: call void @foo(i32 [[MINMAX]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EC_PEEL:%.*]] = icmp ne i32 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EC_PEEL]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK-NEXT: br i1 [[EC_PEEL]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]] +; CHECK: [[EXIT_PEEL_NEXT]]: +; CHECK-NEXT: br label %[[LOOP_PEEL_NEXT:.*]] +; CHECK: [[LOOP_PEEL_NEXT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MINMAX_LCSSA:%.*]] = phi i32 
[ [[MINMAX]], %[[LOOP]] ] -; CHECK-NEXT: ret i32 [[MINMAX_LCSSA]] +; CHECK-NEXT: ret i32 [[MINMAX]] ; entry: br label %loop @@ -135,3 +158,8 @@ exit: ret i32 %minmax.lcssa } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 1} +; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll index ae7719757dc30..24c703ae42f0a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll @@ -16,9 +16,9 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8) -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv8i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[VAL]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv8i64() ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP8]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP6]] @@ -36,8 +36,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP14]], ptr [[TMP17]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8) 
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] @@ -100,9 +100,9 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv8i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[VAL]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv8i64() ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP8]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP6]] @@ -120,8 +120,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP14]], ptr [[TMP17]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index f36161703dba5..976f95ff4f0ba 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -862,8 +862,8 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; DEFAULT-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1 ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE14]] ; DEFAULT: [[PRED_STORE_CONTINUE14]]: -; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8) ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8) ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[EXIT:.*]] @@ -964,8 +964,8 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; PRED-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1 ; PRED-NEXT: br label %[[PRED_STORE_CONTINUE14]] ; PRED: [[PRED_STORE_CONTINUE14]]: -; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8) ; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8) ; PRED-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll index 4775a6ec3f917..d59607711b5bf 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll @@ -123,9 +123,9 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]] ; 
CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) -; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[M]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul [[TMP15]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP17]] ; CHECK-NEXT: [[TMP20:%.*]] = mul i64 1, [[TMP9]] @@ -246,9 +246,9 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) { ; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], [[TMP11]] ; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) -; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[MUL_2_I]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul [[TMP15]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP17]] ; CHECK-NEXT: [[TMP20:%.*]] = mul i64 1, [[TMP9]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll index 5508a65744c6b..895781de31f33 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll @@ -87,12 +87,12 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]] ; CHECK-NEXT: br i1 
[[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX18]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV1]] ; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP1]], align 8 ; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 3 @@ -184,16 +184,16 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT9]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT10]], zeroinitializer -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 -; CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32 -; CHECK-NEXT: [[DOTSPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 ; CHECK-NEXT: [[DOTSPLAT14:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT13]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT14]], +; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT14]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP14]] = select <4 x i1> [[TMP11]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]] ; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX11]], 4 @@ -207,12 +207,12 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]] ; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: 
[[BC_RESUME_VAL21:%.*]] = phi i64 [ [[N_VEC8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX22:%.*]] = phi i32 [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL23:%.*]] = phi i64 [ [[N_VEC8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX24:%.*]] = phi i32 [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL21]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX22]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL23]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX24]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[START]], 0 ; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: [[RED_NEXT]] = select i1 [[C]], i32 [[IV_TRUNC]], i32 [[RED]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll index 4d3afd71921d6..e4718dc216358 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll @@ -394,9 +394,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE36]] ; DEFAULT: [[PRED_STORE_CONTINUE36]]: +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 16 ; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16) ; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] @@ -514,13 +514,13 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]] ; DEFAULT-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; DEFAULT-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15) -; DEFAULT-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[A]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i8 [[B]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i8 [[C]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() ; DEFAULT-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) ; DEFAULT-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] ; DEFAULT-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8 @@ -590,13 +590,13 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; OPTSIZE-NEXT: [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]] ; OPTSIZE-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; OPTSIZE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15) -; 
OPTSIZE-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() ; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[A]], i64 0 ; OPTSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i8 [[B]], i64 0 ; OPTSIZE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i8 [[C]], i64 0 ; OPTSIZE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; OPTSIZE-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() ; OPTSIZE-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) ; OPTSIZE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] ; OPTSIZE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8 @@ -666,13 +666,13 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; MINSIZE-NEXT: [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]] ; MINSIZE-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; MINSIZE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15) -; MINSIZE-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() ; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[A]], i64 0 ; MINSIZE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i8 [[B]], i64 0 ; MINSIZE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i8 [[C]], i64 0 ; MINSIZE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; MINSIZE-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv16i8() ; MINSIZE-NEXT: [[TMP11:%.*]] = mul [[TMP10]], 
splat (i8 1) ; MINSIZE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] ; MINSIZE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index 74b0c2c0e033a..d02d03b4b437d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -120,8 +120,8 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.split: ; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll new file mode 100644 index 0000000000000..f8551d774de49 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s + +target triple = "aarch64-linux-gnu" + +define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 { +; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) 
#[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP3]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: br i1 true, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3 +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <64 x i8> [[WIDE_VEC2]], <64 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[STRIDED_VEC3]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP0]], <16 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: store <16 x 
i8> zeroinitializer, ptr [[TMP12]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = zext <16 x i32> [[TMP10]] to <16 x i64> +; CHECK-NEXT: [[TMP17]] = or <16 x i64> [[VEC_PHI1]], [[TMP15]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP17]]) +; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP14]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP31]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP16]] +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP32]], i64 [[TMP16]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16, [[TMP36]] +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = insertelement zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.abs.nxv2i32( [[BROADCAST_SPLAT2]], i1 false) +; CHECK-NEXT: 
[[TMP23:%.*]] = call @llvm.abs.nxv2i32( [[BROADCAST_SPLAT2]], i1 false) +; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = mul [[TMP24]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add [[DOTSPLAT]], [[TMP25]] +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP20]] +; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement poison, i64 [[TMP37]], i64 0 +; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector [[DOTSPLATINSERT4]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi [ [[TMP21]], %[[VEC_EPILOG_PH]] ], [ [[TMP34:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], [[VEC_IND]], i32 0, i64 3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i8.nxv2p0( [[TMP38]], i32 1, splat (i1 true), poison) +; CHECK-NEXT: [[TMP28:%.*]] = zext [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP29:%.*]] = call @llvm.umin.nxv2i32( [[TMP22]], [[TMP28]]) +; CHECK-NEXT: [[TMP39:%.*]] = call @llvm.umin.nxv2i32( [[TMP23]], [[TMP29]]) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX6]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0 +; CHECK-NEXT: store zeroinitializer, ptr [[TMP27]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = zext [[TMP39]] to +; CHECK-NEXT: [[TMP34]] = or [[VEC_PHI6]], [[TMP33]] +; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], 
[[TMP20]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT5]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP35]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64( [[TMP34]]) +; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i64 [ [[TMP30]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL8]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX9]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3 +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 +; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]]) +; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]]) +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1 +; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64 +; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 
16 +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3 + %l = load i8, ptr %gep.src.i.i, align 1 + %l.ext = zext i8 %l to i32 + %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false) + %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext) + %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false) + %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0) + %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 0, ptr %gep.dst, align 1 + %min.ext = zext i32 %min.1 to i64 + %red.next = or i64 %red, %min.ext + %iv.next = add i64 %iv, 1 + %exitcond.not.i.i = icmp eq i64 %iv.next, 16 + br i1 %exitcond.not.i.i, label %exit, label %loop + +exit: + ret i64 %red.next +} + +declare i32 @llvm.umin.i32(i32, i32) + +declare i32 @llvm.abs.i32(i32, i1 immarg) + +attributes #0 = { "target-cpu"="neoverse-512tvb" } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll index 49584bd47353d..f44744071ae58 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll @@ -20,11 +20,11 @@ define void @induction_i7(ptr %dst) #0 { ; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP4]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP40]], 2 ; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i8() -; CHECK-NEXT: [[TMP7:%.*]] = trunc [[TMP6]] to ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP40]], 
i64 0 ; CHECK-NEXT: [[DOTSPLAT_:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[DOTSPLAT:%.*]] = trunc [[DOTSPLAT_]] to +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i8() +; CHECK-NEXT: [[TMP7:%.*]] = trunc [[TMP6]] to ; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP7]], splat (i7 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -85,11 +85,11 @@ define void @induction_i3_zext(ptr %dst) #0 { ; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP4]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP40]], 2 ; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i8() -; CHECK-NEXT: [[TMP7:%.*]] = trunc [[TMP6]] to ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP40]], i64 0 ; CHECK-NEXT: [[DOTSPLAT_:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[DOTSPLAT:%.*]] = trunc [[DOTSPLAT_]] to +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i8() +; CHECK-NEXT: [[TMP7:%.*]] = trunc [[TMP6]] to ; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP7]], splat (i3 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 77e713256d247..7e4edf739695a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -101,11 +101,11 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], splat (i64 1) ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 @@ -185,11 +185,11 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], splat (i64 1) ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 @@ -579,9 +579,9 @@ define void @load_gap_reverse(ptr noalias nocapture readonly %P1, ptr noalias no ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: 
[[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i64 1023), [[TMP2]] ; CHECK-NEXT: [[DOTNEG:%.*]] = sub nsw i64 0, [[TMP1]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[DOTNEG]], i64 0 @@ -809,9 +809,9 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -958,9 +958,9 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1113,13 +1113,13 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP7:%.*]] = shl 
nuw nsw i64 [[TMP6]], 2 ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP9:%.*]] = shl [[TMP10]], splat (i64 1) ; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP6]], 3 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 @@ -1191,13 +1191,13 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = shl nuw i64 [[N_VEC]], 1 ; CHECK-NEXT: [[IND_END:%.*]] = or disjoint i64 [[TMP11]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP21:%.*]] 
= shl [[TMP10]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add [[TMP21]], splat (i64 3) ; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP7]], 3 @@ -1284,14 +1284,14 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 ; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i32 [[TMP11]], 2 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], -1 -; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i16 [[DOTPRE]], i32 [[TMP13]] ; CHECK-NEXT: [[TMP15:%.*]] = shl [[TMP14]], splat (i64 1) ; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP9]], 3 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP33]], 2 +; CHECK-NEXT: [[TMP34:%.*]] = add nsw i32 [[TMP16]], -1 +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i16 [[DOTPRE]], i32 [[TMP34]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 3567aff0ace4e..bd2bd5aa27952 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -37,11 +37,11 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = 
call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -60,10 +60,10 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP13]] ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP13]], [[TMP16]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, [[INTERLEAVED_MASK3]]) ; 
SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: @@ -83,11 +83,11 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -108,11 +108,11 @@ define dso_local void @masked_strided1(ptr 
noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP13]] ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP13]], [[TMP16]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, [[INTERLEAVED_MASK3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: @@ -182,11 +182,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; 
SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -201,7 +201,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP12]] ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), [[TMP13]], i32 1, [[TMP10]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: @@ -221,11 +221,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call 
@llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -243,7 +243,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), [[TMP13]], i32 1, [[TMP10]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: @@ -309,13 
+309,13 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[CONV3]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -331,7 +331,7 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP13]] ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), [[TMP14]], i32 1, [[TMP11]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], 
[[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT4]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: @@ -352,13 +352,13 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[CONV3]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -378,7 +378,7 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), [[TMP15]], i32 1, [[TMP12]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT4]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: @@ -456,11 +456,11 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector 
[[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -483,10 +483,10 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sext i32 [[TMP8]] to i64 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]] ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP7]], [[TMP7]], [[TMP7]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP7]], [[TMP7]], [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, [[INTERLEAVED_MASK3]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: @@ -506,11 +506,11 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) ; PREDICATED_TAIL_FOLDING-NEXT: 
[[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -535,11 +535,11 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sext i32 [[TMP8]] to i64 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]] ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP7]], [[TMP7]], [[TMP7]], [[TMP7]]) -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP7]], [[TMP7]], [[TMP7]], [[TMP7]]) +; 
PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, [[INTERLEAVED_MASK3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll index 022789ad9de70..fea57fa8b6b68 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll @@ -107,8 +107,8 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) { ; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP31]], align 8 ; VF4-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; VF4: [[PRED_STORE_CONTINUE6]]: -; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) ; VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF4: [[MIDDLE_BLOCK]]: ; VF4-NEXT: br label %[[EXIT:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll index c6ea44bb85f11..670b08987c81e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll @@ -1,4 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --filter "call.*(cos|sin|tan|cbrt|erf|exp[^e]|gamma|log|sqrt|copysign|dim|min|mod|hypot|nextafter|pow|fma)" --version 2 +; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=LIBMVEC-NEON +; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s -check-prefix=LIBMVEC-NEON-WIDTH-2 +; RUN: opt -mattr=+sve -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -mcpu=neoverse-v1 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s -check-prefix=LIBMVEC-SVE ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=SLEEF-NEON ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -mcpu=neoverse-v1 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s -check-prefix=SLEEF-SVE ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -mcpu=neoverse-v1 -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefixes=SLEEF-SVE-NOPRED @@ -19,6 +22,18 @@ declare double @acos(double) declare float @acosf(float) define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @acos_f64 +; LIBMVEC-NEON-SAME: 
(ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @acos_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_acos( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @acos_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]]) @@ -64,6 +79,18 @@ define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @acos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @acos_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_acosf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @acos_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_acosf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @acos_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -112,6 +139,18 @@ declare double @acosh(double) declare float @acoshf(float) define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @acosh_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acosh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acosh_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acosh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @acosh_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_acosh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @acosh_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acosh(<2 x double> [[WIDE_LOAD:%.*]]) @@ -157,6 +196,18 @@ define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @acosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @acosh_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_acoshf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acosh_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_acoshf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; 
LIBMVEC-SVE-LABEL: define void @acosh_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_acoshf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @acosh_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acoshf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -205,6 +256,18 @@ declare double @asin(double) declare float @asinf(float) define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @asin_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @asin_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_asin( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @asin_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]]) @@ -250,6 +313,18 @@ define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @asin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @asin_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> 
@_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_asinf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @asin_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_asinf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @asin_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -298,6 +373,18 @@ declare double @asinh(double) declare float @asinhf(float) define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @asinh_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asinh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asinh_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asinh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @asinh_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_asinh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @asinh_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asinh(<2 x double> [[WIDE_LOAD:%.*]]) @@ -343,6 +430,18 @@ define void 
@asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @asinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @asinh_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_asinhf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asinh_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_asinhf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @asinh_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_asinhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @asinh_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinhf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -391,6 +490,18 @@ declare double @atan(double) declare float @atanf(float) define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @atan_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @atan_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call 
@_ZGVsMxv_atan( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @atan_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]]) @@ -436,6 +547,18 @@ define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @atan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @atan_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_atanf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @atan_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_atanf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @atan_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -484,6 +607,18 @@ declare double @atan2(double, double) declare float @atan2f(float, float) define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @atan2_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr 
noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @atan2_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_atan2( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @atan2_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -529,6 +664,18 @@ define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @atan2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @atan2_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_atan2f(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @atan2_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_atan2f( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @atan2_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> 
[[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -577,6 +724,18 @@ declare double @atanh(double) declare float @atanhf(float) define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @atanh_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atanh_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @atanh_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_atanh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @atanh_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]]) @@ -622,6 +781,18 @@ define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @atanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @atanh_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atanh_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_atanhf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @atanh_f32 +; LIBMVEC-SVE-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_atanhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @atanh_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -670,6 +841,18 @@ declare double @cbrt(double) declare float @cbrtf(float) define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cbrt_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cbrt(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cbrt_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cbrt(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cbrt_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cbrt( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cbrt_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cbrt(<2 x double> [[WIDE_LOAD:%.*]]) @@ -715,6 +898,18 @@ define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @cbrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cbrt_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_cbrtf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define 
void @cbrt_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_cbrtf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cbrt_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cbrtf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cbrt_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cbrtf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -763,6 +958,18 @@ declare double @copysign(double, double) declare float @copysignf(float, float) define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @copysign_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @copysign_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]]) +; ; SLEEF-NEON-LABEL: define void @copysign_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_copysign(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -808,6 +1015,18 @@ define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void 
@copysign_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @copysign_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @copysign_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]]) +; ; SLEEF-NEON-LABEL: define void @copysign_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_copysignf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -856,6 +1075,18 @@ declare double @cos(double) declare float @cosf(float) define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cos_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cos_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cos( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void 
@cos_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]]) @@ -901,6 +1132,18 @@ define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @cos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cos_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_cosf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cos_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cosf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cos_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -949,6 +1192,18 @@ declare double @cosh(double) declare float @coshf(float) define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cosh_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x 
double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cosh_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cosh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cosh_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]]) @@ -994,6 +1249,18 @@ define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @cosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cosh_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_coshf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cosh_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_coshf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cosh_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1042,6 +1309,18 @@ declare double @cospi(double) declare float @cospif(float) define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cospi_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call 
double @cospi(double [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cospi_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cospi_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cospi_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cospi(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1087,6 +1366,18 @@ define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @cospi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cospi_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cospi_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cospi_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cospi_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cospif(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1135,6 +1426,18 @@ declare double @erf(double) declare float @erff(float) define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @erf_f64 +; LIBMVEC-NEON-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erf(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erf_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erf(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @erf_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_erf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @erf_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_erf(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1180,6 +1483,18 @@ define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @erf_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @erf_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_erff(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erf_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_erff(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @erf_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_erff( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @erf_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: 
[[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_erff(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1228,6 +1543,18 @@ declare double @erfc(double) declare float @erfcf(float) define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @erfc_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erfc(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erfc_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erfc(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @erfc_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_erfc( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @erfc_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_erfc(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1273,6 +1600,18 @@ define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @erfc_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @erfc_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_erfcf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erfc_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_erfcf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @erfc_f32 +; LIBMVEC-SVE-SAME: (ptr 
noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_erfcf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @erfc_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_erfcf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1321,6 +1660,18 @@ declare double @exp(double) declare float @expf(float) define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1366,6 +1717,18 @@ define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @exp_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void 
@exp_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_expf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_expf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1414,6 +1777,18 @@ declare double @exp10(double) declare float @exp10f(float) define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp10_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp10_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp10( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp10_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1459,6 +1834,18 @@ define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @exp10_f32(ptr noalias 
%in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp10_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp10f(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp10_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp10f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp10_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1507,6 +1894,18 @@ declare double @exp2(double) declare float @exp2f(float) define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp2_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp2_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp2( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: 
define void @exp2_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1552,6 +1951,18 @@ define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @exp2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp2_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp2f(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp2_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp2f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp2_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1600,6 +2011,18 @@ declare double @expm1(double) declare float @expm1f(float) define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @expm1_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_expm1(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @expm1_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs 
<2 x double> @_ZGVnN2v_expm1(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @expm1_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_expm1( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @expm1_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_expm1(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1645,6 +2068,18 @@ define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @expm1_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @expm1_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_expm1f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @expm1_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_expm1f(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @expm1_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_expm1f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @expm1_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expm1f(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1693,6 +2128,18 @@ declare double @fdim(double, double) declare float @fdimf(float, float) define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fdim_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) 
#[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fdim_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @fdim_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]]) +; ; SLEEF-NEON-LABEL: define void @fdim_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fdim(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1738,6 +2185,18 @@ define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @fdim_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fdim_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fdim_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @fdim_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]]) +; ; SLEEF-NEON-LABEL: define void @fdim_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fdimf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1786,6 +2245,18 @@ declare double 
@fma(double, double, double) declare float @fmaf(float, float, float) define void @fma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fma_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @fma_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]]) +; ; SLEEF-NEON-LABEL: define void @fma_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vvv_fma(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) @@ -1831,6 +2302,18 @@ define void @fma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @fma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fma_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @fma_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail 
call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]]) +; ; SLEEF-NEON-LABEL: define void @fma_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vvv_fmaf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]]) @@ -1879,6 +2362,18 @@ declare double @fmax(double, double) declare float @fmaxf(float, float) define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fmax_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmax_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @fmax_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]]) +; ; SLEEF-NEON-LABEL: define void @fmax_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmax(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1924,6 +2419,18 @@ define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @fmax_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fmax_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmax_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias 
[[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @fmax_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]]) +; ; SLEEF-NEON-LABEL: define void @fmax_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fmaxf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1972,6 +2479,18 @@ declare double @fmin(double, double) declare float @fminf(float, float) define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fmin_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmin_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @fmin_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]]) +; ; SLEEF-NEON-LABEL: define void @fmin_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmin(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -2017,6 +2536,18 @@ define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @fmin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fmin_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmin_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @fmin_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]]) +; ; SLEEF-NEON-LABEL: define void @fmin_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fminf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -2065,6 +2596,18 @@ declare double @fmod(double, double) declare float @fmodf(float, float) define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fmod_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmod_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @fmod_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]]) +; ; SLEEF-NEON-LABEL: define void @fmod_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmod(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -2110,6 
+2653,18 @@ define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @fmod_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fmod_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmod_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @fmod_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]]) +; ; SLEEF-NEON-LABEL: define void @fmod_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fmodf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -2158,6 +2713,18 @@ declare double @hypot(double, double) declare float @hypotf(float, float) define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @hypot_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_hypot(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @hypot_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_hypot(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @hypot_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) 
#[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_hypot( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @hypot_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_hypot(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -2203,6 +2770,18 @@ define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @hypot_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @hypot_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_hypotf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @hypot_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_hypotf(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @hypot_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_hypotf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @hypot_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_hypotf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -2251,6 +2830,18 @@ declare i32 @ilogb(double) declare i32 @ilogbf(float) define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @ilogb_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = 
tail call i32 @ilogb(double [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ilogb_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @ilogb_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @ilogb_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x i32> @_ZGVnN2v_ilogb(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2296,6 +2887,18 @@ define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @ilogb_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @ilogb_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ilogb_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @ilogb_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @ilogb_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x i32> @_ZGVnN4v_ilogbf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2344,6 +2947,18 @@ declare double @ldexp(double, i32) declare float @ldexpf(float, i32) define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @ldexp_f64 +; 
LIBMVEC-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ldexp_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @ldexp_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]]) +; ; SLEEF-NEON-LABEL: define void @ldexp_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP4:%.*]] = call <2 x double> @_ZGVnN2vv_ldexp(<2 x double> [[WIDE_LOAD:%.*]], <2 x i32> [[WIDE_LOAD1:%.*]]) @@ -2391,6 +3006,18 @@ define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % } define void @ldexp_f32(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @ldexp_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ldexp_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @ldexp_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: 
[[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]]) +; ; SLEEF-NEON-LABEL: define void @ldexp_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP4:%.*]] = call <4 x float> @_ZGVnN4vv_ldexpf(<4 x float> [[WIDE_LOAD:%.*]], <4 x i32> [[WIDE_LOAD1:%.*]]) @@ -2441,6 +3068,18 @@ declare double @lgamma(double) declare float @lgammaf(float) define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @lgamma_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @lgamma_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @lgamma_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @lgamma_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2486,6 +3125,18 @@ define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @lgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @lgamma_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @lgamma_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: 
[[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @lgamma_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @lgamma_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2534,6 +3185,18 @@ declare double @log(double) declare float @logf(float) define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2579,6 +3242,18 @@ define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @log_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs 
<4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_logf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_logf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2627,6 +3302,18 @@ declare double @log10(double) declare float @log10f(float) define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log10_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log10_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log10( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log10_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2672,6 +3359,18 @@ 
define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @log10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log10_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log10f(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log10_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log10f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log10_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2720,6 +3419,18 @@ declare double @log1p(double) declare float @log1pf(float) define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log1p_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log1p(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log1p_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log1p(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log1p_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: 
[[TMP12:%.*]] = call @_ZGVsMxv_log1p( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log1p_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log1p(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2765,6 +3476,18 @@ define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @log1p_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log1p_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log1pf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log1p_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log1pf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log1p_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log1pf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log1p_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log1pf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2813,6 +3536,18 @@ declare double @log2(double) declare float @log2f(float) define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log2_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log2_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log2( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log2_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2858,6 +3593,18 @@ define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @log2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log2_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log2f(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log2_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log2f( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log2_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2909,6 +3656,18 @@ declare double @modf(double, ptr) declare float @modff(float, ptr) define void @modf_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +; 
LIBMVEC-NEON-LABEL: define void @modf_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @modf_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @modf_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) +; ; SLEEF-NEON-LABEL: define void @modf_f64 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]]) @@ -2953,6 +3712,18 @@ for.cond.cleanup: } define void @modf_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +; LIBMVEC-NEON-LABEL: define void @modf_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @modf_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @modf_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) +; ; SLEEF-NEON-LABEL: define void @modf_f32 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[DATA:%.*]] = 
call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]]) @@ -3000,6 +3771,18 @@ declare double @nextafter(double, double) declare float @nextafterf(float, float) define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @nextafter_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nextafter_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @nextafter_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]]) +; ; SLEEF-NEON-LABEL: define void @nextafter_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_nextafter(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -3045,6 +3828,18 @@ define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @nextafter_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @nextafter_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nextafter_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]]) +; +; LIBMVEC-SVE-LABEL: define void @nextafter_f32 +; LIBMVEC-SVE-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]]) +; ; SLEEF-NEON-LABEL: define void @nextafter_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_nextafterf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -3093,6 +3888,18 @@ declare double @pow(double, double) declare float @powf(float, float) define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @pow_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @pow_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_pow( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @pow_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -3138,6 +3945,18 @@ define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @pow_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @pow_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: 
[[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_powf(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @pow_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_powf( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @pow_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -3186,6 +4005,18 @@ declare double @sin(double) declare float @sinf(float) define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sin_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sin_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sin( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sin_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] 
{ ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3231,6 +4062,18 @@ define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @sin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sin_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sin_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sinf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sin_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -3282,6 +4125,18 @@ declare void @sincos(double, ptr, ptr) declare void @sincosf(float, ptr, ptr) define void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +; LIBMVEC-NEON-LABEL: define void @sincos_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincos_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sincos_f64 +; 
LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sincos_f64 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) @@ -3325,6 +4180,18 @@ for.cond.cleanup: } define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +; LIBMVEC-NEON-LABEL: define void @sincos_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincos_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sincos_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sincos_f32 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) @@ -3374,6 +4241,18 @@ declare void @sincospi(double, ptr, ptr) declare void @sincospif(float, ptr, ptr) define void @sincospi_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +; LIBMVEC-NEON-LABEL: define void @sincospi_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: 
define void @sincospi_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sincospi_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sincospi_f64 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) @@ -3417,6 +4296,18 @@ for.cond.cleanup: } define void @sincospi_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) { +; LIBMVEC-NEON-LABEL: define void @sincospi_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincospi_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sincospi_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sincospi_f32 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]]) @@ -3463,6 +4354,18 @@ declare double @sinh(double) declare float @sinhf(float) define void @sinh_f64(ptr noalias %in.ptr, ptr noalias 
%out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sinh_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sinh_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sinh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sinh_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3508,6 +4411,18 @@ define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @sinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sinh_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinhf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sinh_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sinhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void 
@sinh_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -3556,6 +4471,18 @@ declare double @sinpi(double) declare float @sinpif(float) define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sinpi_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinpi_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sinpi_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sinpi_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinpi(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3601,6 +4528,18 @@ define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @sinpi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sinpi_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinpi_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sinpi_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; 
LIBMVEC-SVE: [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sinpi_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinpif(<4 x float> [[WIDE_LOAD:%.*]]) @@ -3649,6 +4588,18 @@ declare double @sqrt(double) declare float @sqrtf(float) define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sqrt_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sqrt_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sqrt_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3694,6 +4645,18 @@ define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @sqrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sqrt_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sqrt_f32 +; 
LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sqrt_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -3742,6 +4705,18 @@ declare double @tan(double) declare float @tanf(float) define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @tan_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @tan_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_tan( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @tan_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3787,6 +4762,18 @@ define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @tan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @tan_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f32 +; 
LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @tan_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_tanf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @tan_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -3835,6 +4822,18 @@ declare double @tanh(double) declare float @tanhf(float) define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @tanh_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @tanh_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_tanh( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @tanh_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3880,6 +4879,18 @@ define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @tanh_f32(ptr noalias %in.ptr, ptr noalias 
%out.ptr) { +; LIBMVEC-NEON-LABEL: define void @tanh_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanhf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @tanh_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[TMP12:%.*]] = call @_ZGVsMxv_tanhf( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; ; SLEEF-NEON-LABEL: define void @tanh_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -3928,6 +4939,18 @@ declare double @tgamma(double) declare float @tgammaf(float) define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @tgamma_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tgamma_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @tgamma_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @tgamma_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: 
[[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[WIDE_LOAD:%.*]]) @@ -3973,6 +4996,18 @@ define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { } define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @tgamma_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON: [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tgamma_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-NEON-WIDTH-2: [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @tgamma_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @tgamma_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[WIDE_LOAD:%.*]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll index f753df32d9ebc..f6f2e39594dd8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --filter "call.*(acos|asin|atan|atan2|cos|cosh|exp|log|sin|sinh|pow|ceil|copysign|fabs|floor|fma|m..num|nearbyint|rint|round|sqrt|tan|tanh|trunc)" --version 2 +; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=LIBMVEC-NEON +; RUN: opt -mattr=+neon 
-vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=LIBMVEC-NEON-WIDTH-2 +; RUN: opt -mattr=+sve -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=LIBMVEC-SVE ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE ; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=ARMPL-NEON @@ -16,6 +19,19 @@ declare double @llvm.acos.f64(double) declare float @llvm.acos.f32(float) define void @acos_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @acos_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @acos_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_acos( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.acos.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void 
@acos_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]]) @@ -51,6 +67,19 @@ define void @acos_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @acos_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @acos_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_acosf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @acos_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_acosf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.acos.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @acos_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -89,6 +118,19 @@ declare double @llvm.asin.f64(double) declare float @llvm.asin.f32(float) define void @asin_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @asin_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> 
@_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @asin_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_asin( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.asin.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @asin_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]]) @@ -124,6 +166,19 @@ define void @asin_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @asin_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @asin_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_asinf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @asin_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_asinf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.asin.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @asin_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -162,6 +217,19 @@ declare double @llvm.atan.f64(double) declare float @llvm.atan.f32(float) define void @atan_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @atan_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], 
ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @atan_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_atan( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.atan.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @atan_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]]) @@ -197,6 +265,19 @@ define void @atan_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @atan_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @atan_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_atanf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @atan_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_atanf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.atan.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @atan_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
[[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -235,6 +316,19 @@ declare double @llvm.atan2.f64(double, double) declare float @llvm.atan2.f32(float, float) define void @atan2_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @atan2_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @atan2_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxvv_atan2( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.atan2.f64(double [[IN:%.*]], double [[IN]]) +; ; SLEEF-NEON-LABEL: define void @atan2_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -270,6 +364,19 @@ define void @atan2_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @atan2_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @atan2_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], 
ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_atan2f(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @atan2_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxvv_atan2f( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.atan2.f32(float [[IN:%.*]], float [[IN]]) +; ; SLEEF-NEON-LABEL: define void @atan2_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -308,6 +415,18 @@ declare double @llvm.ceil.f64(double) declare float @llvm.ceil.f32(float) define void @ceil_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @ceil_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ceil_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @ceil_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @ceil_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -343,6 +462,18 @@ define void @ceil_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @ceil_f32(ptr noalias %in.ptr, ptr %out.ptr) 
{ +; LIBMVEC-NEON-LABEL: define void @ceil_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ceil_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @ceil_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @ceil_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -381,6 +512,19 @@ declare double @llvm.copysign.f64(double, double) declare float @llvm.copysign.f32(float, float) define void @copysign_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @copysign_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @copysign_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @llvm.copysign.nxv2f64( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]]) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.copysign.f64(double [[IN:%.*]], double [[IN]]) 
+; ; SLEEF-NEON-LABEL: define void @copysign_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -416,6 +560,19 @@ define void @copysign_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @copysign_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @copysign_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @copysign_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @llvm.copysign.nxv4f32( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]]) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.copysign.f32(float [[IN:%.*]], float [[IN]]) +; ; SLEEF-NEON-LABEL: define void @copysign_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -454,6 +611,19 @@ declare double @llvm.cos.f64(double) declare float @llvm.cos.f32(float) define void @cos_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cos_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f64 +; 
LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cos_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_cos( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cos_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]]) @@ -489,6 +659,19 @@ define void @cos_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @cos_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cos_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_cosf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cos_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_cosf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cos_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -527,6 +710,19 @@ declare double @llvm.cosh.f64(double) declare float @llvm.cosh.f32(float) define 
void @cosh_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cosh_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cosh_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_cosh( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.cosh.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cosh_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]]) @@ -562,6 +758,19 @@ define void @cosh_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @cosh_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @cosh_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_coshf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @cosh_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_coshf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] 
= tail call float @llvm.cosh.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @cosh_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -600,6 +809,19 @@ declare double @llvm.exp.f64(double) declare float @llvm.exp.f32(float) define void @exp_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_exp( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]]) @@ -635,6 +857,19 @@ define void @exp_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @exp_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; 
LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_expf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_expf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -673,6 +908,19 @@ declare double @llvm.exp10.f64(double) declare float @llvm.exp10.f32(float) define void @exp10_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp10_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp10_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_exp10( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.exp10.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp10_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]]) @@ -708,6 +956,19 @@ define void @exp10_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @exp10_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; 
LIBMVEC-NEON-LABEL: define void @exp10_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp10f(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp10_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_exp10f( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.exp10.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp10_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]]) @@ -746,6 +1007,19 @@ declare double @llvm.exp2.f64(double) declare float @llvm.exp2.f32(float) define void @exp2_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp2_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp2_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_exp2( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double 
@llvm.exp2.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp2_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]]) @@ -781,6 +1055,19 @@ define void @exp2_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @exp2_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @exp2_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp2f(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @exp2_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_exp2f( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @exp2_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]]) @@ -819,6 +1106,18 @@ declare double @llvm.fabs.f64(double) declare float @llvm.fabs.f32(float) define void @fabs_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fabs_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fabs_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: 
[[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @fabs_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @fabs_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -854,6 +1153,18 @@ define void @fabs_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @fabs_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fabs_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fabs_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @fabs_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @fabs_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -892,6 +1203,18 @@ declare double @llvm.floor.f64(double) declare float @llvm.floor.f32(float) define void @floor_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @floor_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; 
LIBMVEC-NEON-WIDTH-2-LABEL: define void @floor_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @floor_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @floor_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -927,6 +1250,18 @@ define void @floor_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @floor_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @floor_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @floor_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @floor_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @floor_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -965,6 +1300,18 @@ declare double @llvm.fma.f64(double, double, double) declare float @llvm.fma.f32(float, float, float) define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: 
define void @fma_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @fma_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) +; ; SLEEF-NEON-LABEL: define void @fma_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) @@ -1000,6 +1347,18 @@ define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @fma_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @fma_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]], <2 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @fma_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; 
LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]]) +; ; SLEEF-NEON-LABEL: define void @fma_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]]) @@ -1038,6 +1397,19 @@ declare double @llvm.log.f64(double) declare float @llvm.log.f32(float) define void @log_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_log( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.log.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1073,6 +1445,19 @@ define void @log_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @log_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; 
LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_logf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_logf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.log.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1111,6 +1496,19 @@ declare double @llvm.log10.f64(double) declare float @llvm.log10.f32(float) define void @log10_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log10_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log10_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_log10( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log10_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1146,6 
+1544,19 @@ define void @log10_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @log10_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log10_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log10f(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log10_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_log10f( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.log10.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log10_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1184,6 +1595,19 @@ declare double @llvm.log2.f64(double) declare float @llvm.log2.f32(float) define void @log2_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log2_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log2_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; 
LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_log2( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.log2.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log2_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1219,6 +1643,19 @@ define void @log2_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @log2_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @log2_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log2f(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @log2_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_log2f( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @log2_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1257,6 +1694,18 @@ declare double @llvm.maxnum.f64(double, double) declare float @llvm.maxnum.f32(float, float) define void @maxnum_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @maxnum_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> 
[[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @maxnum_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @maxnum_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; ; SLEEF-NEON-LABEL: define void @maxnum_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1292,6 +1741,18 @@ define void @maxnum_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @maxnum_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @maxnum_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @maxnum_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @maxnum_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; ; SLEEF-NEON-LABEL: define void @maxnum_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x 
float> [[WIDE_LOAD]]) @@ -1330,6 +1791,18 @@ declare double @llvm.minnum.f64(double, double) declare float @llvm.minnum.f32(float, float) define void @minnum_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @minnum_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @minnum_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @minnum_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; ; SLEEF-NEON-LABEL: define void @minnum_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1365,6 +1838,18 @@ define void @minnum_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @minnum_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @minnum_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @minnum_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void 
@minnum_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; ; SLEEF-NEON-LABEL: define void @minnum_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1403,6 +1888,18 @@ declare double @llvm.nearbyint.f64(double) declare float @llvm.nearbyint.f32(float) define void @nearbyint_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @nearbyint_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nearbyint_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @nearbyint_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @nearbyint_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1438,6 +1935,18 @@ define void @nearbyint_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @nearbyint_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @nearbyint_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> 
[[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nearbyint_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @nearbyint_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @nearbyint_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1476,6 +1985,19 @@ declare double @llvm.pow.f64(double, double) declare float @llvm.pow.f32(float, float) define void @pow_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @pow_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @pow_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxvv_pow( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.pow.f64(double [[IN:%.*]], double [[IN]]) +; ; SLEEF-NEON-LABEL: define void @pow_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> 
@_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) @@ -1511,6 +2033,19 @@ define void @pow_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @pow_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @pow_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_powf(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]]) +; +; LIBMVEC-SVE-LABEL: define void @pow_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxvv_powf( [[WIDE_LOAD:%.*]], [[WIDE_LOAD]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.pow.f32(float [[IN:%.*]], float [[IN]]) +; ; SLEEF-NEON-LABEL: define void @pow_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) @@ -1549,6 +2084,18 @@ declare double @llvm.rint.f64(double) declare float @llvm.rint.f32(float) define void @rint_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @rint_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @rint_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) 
+; +; LIBMVEC-SVE-LABEL: define void @rint_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @rint_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1584,6 +2131,18 @@ define void @rint_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @rint_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @rint_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @rint_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @rint_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @rint_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1622,6 +2181,18 @@ declare double @llvm.round.f64(double) declare float @llvm.round.f32(float) define void @round_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @round_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @round_f64 +; 
LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @round_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @round_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1657,6 +2228,18 @@ define void @round_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @round_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @round_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @round_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @round_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @round_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1695,6 +2278,19 @@ declare double @llvm.sin.f64(double) declare float @llvm.sin.f32(float) define void @sin_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sin_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
[[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sin_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_sin( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sin_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1730,6 +2326,19 @@ define void @sin_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @sin_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sin_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sin_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_sinf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sin_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) 
#[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1768,6 +2377,19 @@ declare double @llvm.sinh.f64(double) declare float @llvm.sinh.f32(float) define void @sinh_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sinh_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sinh_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_sinh( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.sinh.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sinh_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1803,6 +2425,19 @@ define void @sinh_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @sinh_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sinh_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinhf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void 
@sinh_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_sinhf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.sinh.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sinh_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1841,6 +2476,18 @@ declare double @llvm.sqrt.f64(double) declare float @llvm.sqrt.f32(float) define void @sqrt_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sqrt_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sqrt_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sqrt_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1876,6 +2523,18 @@ define void @sqrt_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @sqrt_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @sqrt_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void 
@sqrt_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @sqrt_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @sqrt_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1914,6 +2573,19 @@ declare double @llvm.tan.f64(double) declare float @llvm.tan.f32(float) define void @tan_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @tan_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @tan_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_tan( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.tan.f64(double [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @tan_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]]) @@ -1949,6 +2621,19 @@ define void @tan_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @tan_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; 
LIBMVEC-NEON-LABEL: define void @tan_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @tan_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_tanf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.tan.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @tan_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -1987,6 +2672,19 @@ declare double @llvm.tanh.f64(double) declare float @llvm.tanh.f32(float) define void @tanh_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @tanh_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @tanh_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_tanh( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call double @llvm.tanh.f64(double 
[[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @tanh_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2022,6 +2720,19 @@ define void @tanh_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @tanh_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @tanh_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanhf(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @tanh_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP8:%.*]] = call @_ZGVsMxv_tanhf( [[WIDE_LOAD:%.*]], splat (i1 true)) +; LIBMVEC-SVE: [[CALL:%.*]] = tail call float @llvm.tanh.f32(float [[IN:%.*]]) +; ; SLEEF-NEON-LABEL: define void @tanh_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]]) @@ -2060,6 +2771,18 @@ declare double @llvm.trunc.f64(double) declare float @llvm.trunc.f32(float) define void @trunc_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @trunc_f64 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @trunc_f64 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call 
<2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @trunc_f64 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @trunc_f64 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) @@ -2095,6 +2818,18 @@ define void @trunc_f64(ptr noalias %in.ptr, ptr %out.ptr) { } define void @trunc_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; LIBMVEC-NEON-LABEL: define void @trunc_f32 +; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-NEON-WIDTH-2-LABEL: define void @trunc_f32 +; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-NEON-WIDTH-2: [[TMP2:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) +; +; LIBMVEC-SVE-LABEL: define void @trunc_f32 +; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; LIBMVEC-SVE: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; ; SLEEF-NEON-LABEL: define void @trunc_f32 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { ; SLEEF-NEON: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll index a7a7b1af5953b..1d898fbaaed36 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll @@ -390,9 +390,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 
%n) ; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE36]] ; DEFAULT: [[PRED_STORE_CONTINUE36]]: +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16) ; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll index d340985457168..e3f9540ff3df7 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll @@ -1,56 +1,63 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -p loop-vectorize -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -scalable-vectorization=on -S %s | FileCheck %s target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" target triple = "riscv64-unknown-linux-gnu" -; Make sure we do not pick as VF for a loop with a -; first-order recurrence. 
define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 { ; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for( ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 23, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 23, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 23, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1 +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i64 0, i32 [[TMP4]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 -; CHECK-NEXT: [[WIDE_LOAD1]] = load <4 x i64>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[WIDE_LOAD]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> [[WIDE_LOAD1]], <4 x i32> +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] 
= getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.splice.nxv1i64( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 4 -; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP9]], align 8 -; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: store [[TMP7]], ptr [[TMP9]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 2 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[WIDE_LOAD1]], i32 3 -; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP7]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP15]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; 
CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L]] = load i64, ptr [[GEP]], align 8 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L]] = load i64, ptr [[GEP_SRC]], align 8 ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i64 [[FOR]], ptr [[GEP_DST]], align 8 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22 ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RES]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll index b7c9612e57aec..79425ae3a67ec 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll @@ -21,11 +21,11 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] ; 
SCALAR_EPILOGUE-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_EPILOGUE-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() ; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_EPILOGUE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 -; SCALAR_EPILOGUE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALAR_EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_EPILOGUE: vector.body: ; SCALAR_EPILOGUE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -38,8 +38,8 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[TMP11:%.*]] = or disjoint [[TMP8]], splat (i32 1) ; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = zext nneg [[TMP11]] to ; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP12]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP13]], i32 1, [[TMP7]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER1]]) +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP13]], i32 1, [[TMP7]], poison) +; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) ; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = zext nneg [[TMP8]] to ; SCALAR_EPILOGUE-NEXT: 
[[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP15]] ; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP14]], [[TMP16]], i32 1, [[TMP7]]) @@ -48,7 +48,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP18]] ; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP17]], [[TMP19]], i32 1, [[TMP7]]) ; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] -; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_EPILOGUE-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_EPILOGUE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALAR_EPILOGUE: middle.block: @@ -69,11 +69,11 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 +; 
PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -88,8 +88,8 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint [[TMP7]], splat (i32 1) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg [[TMP10]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP11]] -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP12]], i32 1, [[TMP6]], poison) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP12]], i32 1, [[TMP6]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg [[TMP7]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP13]], [[TMP15]], i32 1, [[TMP6]]) @@ -98,7 +98,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP17]] ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP16]], [[TMP18]], i32 1, [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; 
PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: @@ -190,11 +190,11 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] ; SCALAR_EPILOGUE-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_EPILOGUE-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 -; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() ; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_EPILOGUE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 -; SCALAR_EPILOGUE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALAR_EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_EPILOGUE: vector.body: ; SCALAR_EPILOGUE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -209,16 +209,16 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP13]], i32 1, [[TMP7]], poison) ; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = zext nneg [[TMP9]] to ; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP14]] -; 
SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP15]], i32 1, [[TMP7]], poison) +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP15]], i32 1, [[TMP7]], poison) ; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = zext nneg [[TMP10]] to ; SCALAR_EPILOGUE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP16]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP17]], i32 1, [[TMP7]], poison) +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP17]], i32 1, [[TMP7]], poison) ; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = zext nneg [[TMP11]] to ; SCALAR_EPILOGUE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP18]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP19]], i32 1, [[TMP7]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP20:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER1]]) +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP19]], i32 1, [[TMP7]], poison) +; SCALAR_EPILOGUE-NEXT: [[TMP20:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) ; SCALAR_EPILOGUE-NEXT: [[TMP21:%.*]] = sub zeroinitializer, [[TMP20]] -; SCALAR_EPILOGUE-NEXT: [[TMP22:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER2]], [[WIDE_MASKED_GATHER3]]) +; SCALAR_EPILOGUE-NEXT: [[TMP22:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER5]]) ; SCALAR_EPILOGUE-NEXT: [[TMP23:%.*]] = sub zeroinitializer, [[TMP22]] ; SCALAR_EPILOGUE-NEXT: [[TMP24:%.*]] = zext nneg [[TMP8]] to ; SCALAR_EPILOGUE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP24]] @@ -233,7 +233,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[TMP31:%.*]] = 
getelementptr inbounds i8, ptr [[Q]], [[TMP30]] ; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP23]], [[TMP31]], i32 1, [[TMP7]]) ; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] -; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_EPILOGUE-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_EPILOGUE-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALAR_EPILOGUE: middle.block: @@ -254,11 +254,11 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] 
] @@ -275,16 +275,16 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP12]], i32 1, [[TMP6]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg [[TMP8]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP13]] -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP14]], i32 1, [[TMP6]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP14]], i32 1, [[TMP6]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg [[TMP9]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP15]] -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP16]], i32 1, [[TMP6]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP16]], i32 1, [[TMP6]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg [[TMP10]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP17]] -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP18]], i32 1, [[TMP6]], poison) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP18]], i32 1, [[TMP6]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = call @llvm.smax.nxv16i8( 
[[WIDE_MASKED_GATHER2]], [[WIDE_MASKED_GATHER3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER5]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = sub zeroinitializer, [[TMP21]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = zext nneg [[TMP7]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP23]] @@ -299,7 +299,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP29]] ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP22]], [[TMP30]], i32 1, [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll index c64f6df075a04..3e4d337c0706c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll @@ -22,9 +22,9 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) { ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLENUNK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; VLENUNK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; VLENUNK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 ; VLENUNK-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; VLENUNK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; VLENUNK-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i64 1) ; VLENUNK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] ; VLENUNK-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP5]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll index 88d9ed2ce201e..2f9ff20bf0f98 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll @@ -20,13 +20,13 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1001, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv8i64() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[B]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[C]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv8i64() ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP5]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll index 3dc17e615048e..51a8b451dffd9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll @@ -137,8 +137,8 @@ define void @test(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: store i8 [[TMP40]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue32: -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT1:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index b026e68685812..ba4c4b6d58add 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -38,10 +38,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For 
instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 @@ -147,10 +147,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF 
vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 @@ -178,7 +178,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 32 +; CHECK-NEXT: LV: Loop cost is 24 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. @@ -447,10 +447,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF 
vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 @@ -556,10 +556,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom -; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 +; CHECK-NEXT: LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1 ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an 
estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 @@ -587,7 +587,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop cost is 34 +; CHECK-NEXT: LV: Loop cost is 26 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 ; CHECK-NEXT: LV: Not Interleaving. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index f89a863d1e5f5..79590f5060ad4 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -553,9 +553,9 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; STRIDED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; STRIDED-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i64() ; STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[STRIDE]], i64 0 ; STRIDED-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; STRIDED-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i64() ; STRIDED-NEXT: [[TMP14:%.*]] = mul [[TMP12]], splat (i64 1) ; STRIDED-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] ; STRIDED-NEXT: [[TMP17:%.*]] = mul i64 1, [[TMP11]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 70c04ded5cf57..827612cfe36d5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -325,9 +325,9 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) ; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] ; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]] @@ -432,9 +432,9 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv4i64() ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv4i64() ; TF-SCALABLE-NEXT: [[TMP6:%.*]] = mul [[TMP5]], splat (i64 1) ; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]] @@ -996,11 +996,11 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 -; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call 
@llvm.stepvector.nxv2i64() ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[V]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP5]], splat (i64 1) ; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]] @@ -1127,11 +1127,11 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[V]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; SCALABLE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() ; SCALABLE-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i64 1) ; SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] ; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP5]] @@ -1233,11 +1233,11 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TF-SCALABLE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 -; TF-SCALABLE-NEXT: 
[[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[V]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv2i64() ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = mul [[TMP5]], splat (i64 1) ; TF-SCALABLE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP4]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll index 427123cfca6d4..cd246053bcb30 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll @@ -29,17 +29,17 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 -; IF-EVL-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() ; IF-EVL-NEXT: [[TMP12:%.*]] = mul [[TMP10]], splat (i64 1) ; IF-EVL-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector 
[[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ule [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp ule [[STEP_ADD]], [[BROADCAST_SPLAT]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll index 7d9ed7d6215c5..05a495d51c458 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -110,8 +110,8 @@ define void @redundant_or_1(ptr %dst, i1 %c.0, i1 %c.1) { ; CHECK-NEXT: store i32 0, ptr [[TMP17]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] @@ -214,8 +214,8 @@ define void @redundant_or_2(ptr %dst, i1 %c.0, i1 %c.1) { ; CHECK-NEXT: store i32 0, ptr [[TMP16]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x 
i8> [[VEC_IND]], splat (i8 4) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll index 38b58fbfd1021..53fd2ed43972c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll @@ -39,8 +39,8 @@ define void @drop_scalar_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -91,8 +91,8 @@ define void @drop_scalar_gep_nusw(ptr noalias nocapture readonly %input, ptr %ou ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr nusw float, ptr [[OUTPUT]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr nusw float, ptr [[TMP5]], i32 0 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -143,8 +143,8 @@ define void @drop_scalar_gep_nuw(ptr noalias nocapture readonly %input, ptr %out ; 
CHECK-NEXT: [[TMP5:%.*]] = getelementptr nuw float, ptr [[OUTPUT]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr nuw float, ptr [[TMP5]], i32 0 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -196,8 +196,8 @@ define void @drop_nonpred_scalar_nuw_nsw(ptr noalias nocapture readonly %input, ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -248,8 +248,8 @@ define void @preserve_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -305,8 +305,8 @@ define void @drop_vector_nuw_nsw(ptr noalias nocapture 
readonly %input, ptr %out ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -356,8 +356,8 @@ define void @preserve_nuw_nsw_no_addr(ptr %output) local_unnamed_addr #0 { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 ; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -409,8 +409,8 @@ define void @drop_scalar_exact(ptr noalias nocapture readonly %input, ptr %outpu ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -518,8 +518,8 @@ define void 
@preserve_vector_exact_no_addr(ptr noalias nocapture readonly %input ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -570,8 +570,8 @@ define void @preserve_exact_no_addr(ptr %output) local_unnamed_addr #0 { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 ; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -698,8 +698,8 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) { ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0 ; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP16]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; @@ -778,8 +778,8 @@ define void 
@recipe_without_underlying_instr_lanes_used(i64 %n, ptr noalias %dst ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 ; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP11]], align 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 7aeb32afe43be..0a85548f8750b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -411,9 +411,9 @@ define i16 @iv_and_step_trunc() { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i16> [[VEC_IND1]], [[TMP1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 4fbee321b6a48..7f2544ddf149d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -519,8 +519,8 @@ define void 
@interleave_store_double_i64(ptr %dst) { ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP2]], <4 x i32> ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -640,8 +640,8 @@ define void @interleave_store_i64_double_2(ptr %dst) { ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> zeroinitializer, <4 x i32> ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll index 6480c0ab1099d..02d48cbda1aab 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll @@ -71,8 +71,8 @@ ; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] ; 
AVX: [[ForInc]]: -; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], splat (i64 8) ; AVX: %[[IndNext]] = add nuw i64 %[[Ind]], 8 +; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], splat (i64 8) ; AVX: br i1 true, label %middle.block, label %vector.body @arr2 = external global [8 x i32], align 16 diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll index 4038ace617c17..99650592d2dea 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -131,7 +131,7 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[IND_END43:%.*]] = mul i64 [[N_VEC32]], 2 ; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true) ; CHECK-NEXT: br label [[VECTOR_BODY29:%.*]] -; CHECK: vector.body28: +; CHECK: vector.body30: ; CHECK-NEXT: [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH25]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY29]] ] ; CHECK-NEXT: [[VEC_IND35:%.*]] = phi <16 x i64> [ , [[VECTOR_PH25]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY29]] ] ; CHECK-NEXT: [[VEC_IND37:%.*]] = phi <16 x i64> [ , [[VECTOR_PH25]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY29]] ] @@ -153,18 +153,18 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[VEC_IND_NEXT36]] = add <16 x i64> [[VEC_IND35]], splat (i64 32) ; CHECK-NEXT: [[VEC_IND_NEXT38]] = add <16 x i64> [[VEC_IND37]], splat (i64 32) ; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC32]] -; CHECK-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK35:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: middle.block35: +; CHECK-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK37:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block37: ; CHECK-NEXT: [[CMP_N40:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC32]] ; CHECK-NEXT: br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK43:%.*]] -; CHECK: vec.epilog.iter.check42: 
+; CHECK: vec.epilog.iter.check44: ; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2 ; CHECK-NEXT: [[IND_END55:%.*]] = add i64 8, [[TMP42]] ; CHECK-NEXT: [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2 ; CHECK-NEXT: [[N_VEC_REMAINING49:%.*]] = sub i64 [[TMP28]], [[N_VEC32]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_VEC_REMAINING49]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH40]], label [[VEC_EPILOG_PH42]] -; CHECK: vec.epilog.ph41: +; CHECK: vec.epilog.ph43: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ] ; CHECK-NEXT: [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ] ; CHECK-NEXT: [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ] @@ -183,7 +183,7 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY49:%.*]] -; CHECK: vec.epilog.vector.body49: +; CHECK: vec.epilog.vector.body57: ; CHECK-NEXT: [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ] ; CHECK-NEXT: [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ] ; CHECK-NEXT: [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ] @@ -206,10 +206,10 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16) ; CHECK-NEXT: [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]] ; CHECK-NEXT: br i1 [[TMP55]], label 
[[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY49]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: vec.epilog.middle.block62: +; CHECK: vec.epilog.middle.block64: ; CHECK-NEXT: [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]] ; CHECK-NEXT: br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH40]] -; CHECK: vec.epilog.scalar.ph40: +; CHECK: vec.epilog.scalar.ph42: ; CHECK-NEXT: [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[ITER_CHECK22]] ] ; CHECK-NEXT: [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[ITER_CHECK22]] ] ; CHECK-NEXT: br label [[FOR_BODY_US:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll index 1365e9f73d854..6e62ff842c6d1 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll @@ -245,8 +245,8 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) { ; CHECK-NEXT: [[IND_END5:%.*]] = trunc i64 [[N_VEC3]] to i32 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0 -; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT6]], <4 x i1> poison, <4 x 
i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll new file mode 100644 index 0000000000000..d98cd45cb634e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll @@ -0,0 +1,270 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s + +; Test case for https://github.com/llvm/llvm-project/issues/144212. +define i8 @recurrence_phi_with_same_incoming_values_after_simplifications(i8 %for.start, ptr %dst) { +; CHECK-LABEL: define i8 @recurrence_phi_with_same_incoming_values_after_simplifications( +; CHECK-SAME: i8 [[FOR_START:%.*]], ptr [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: br i1 true, label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[FOR_START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLAT]], <4 x i8> [[BROADCAST_SPLAT]], <4 x i32> +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = 
add i32 1, [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4 +; CHECK-NEXT: store <4 x i8> [[TMP0]], ptr [[TMP2]], align 1 +; CHECK-NEXT: store <4 x i8> [[TMP0]], ptr [[TMP3]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], -8 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT]], i32 3 +; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -7, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ], [ 1, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[FOR_START]], %[[ENTRY]] ], [ [[FOR_START]], %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i8 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FOR_NEXT]] = and i8 [[FOR_START]], -1 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i8 [[FOR]], ptr [[GEP_DST]], align 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[FOR_NEXT_LCSSA:%.*]] = phi i8 [ [[FOR_NEXT]], %[[LOOP]] ], [ [[TMP5]], 
%[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i8 [[FOR_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ] + %for = phi i8 [ %for.start, %entry ], [ %for.next, %loop ] + %for.next = and i8 %for.start, -1 + %iv.next = add i32 %iv, 1 + %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %iv + store i8 %for, ptr %gep.dst + %ec = icmp eq i32 %iv.next, 0 + br i1 %ec, label %exit, label %loop + +exit: + ret i8 %for.next +} + +; %vec.dead will be marked as dead instruction in the vector loop and no recipe +; will be created for it. Make sure a valid sink target is used. +define i32 @sink_after_dead_inst(ptr %A.ptr) { +; CHECK-LABEL: define i32 @sink_after_dead_inst( +; CHECK-SAME: ptr [[A_PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 +; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) +; CHECK-NEXT: [[TMP1:%.*]] = or <4 x i16> [[TMP0]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP3]], i32 4 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 4 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4) +; 
CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[FOR]], 15 +; CHECK-NEXT: [[C:%.*]] = icmp eq i1 [[CMP]], true +; CHECK-NEXT: [[VEC_DEAD:%.*]] = and i1 [[C]], true +; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; CHECK-NEXT: [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]] +; CHECK-NEXT: [[B3:%.*]] = and i1 [[CMP]], [[C]] +; CHECK-NEXT: [[FOR_PREV]] = zext i16 [[B1]] to i32 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 +; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] +; CHECK-NEXT: store i32 0, ptr [[A_GEP]], align 4 +; CHECK-NEXT: br i1 [[VEC_DEAD]], label %[[FOR_END]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[FOR_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ] + %for = phi i32 [ 0, %entry ], [ %for.prev, %loop ] + %cmp = icmp eq i32 %for, 15 + %C = icmp eq i1 %cmp, true + 
%vec.dead = and i1 %C, 1 + %iv.next = add i16 %iv, 1 + %B1 = or i16 %iv.next, %iv.next + %B3 = and i1 %cmp, %C + %for.prev = zext i16 %B1 to i32 + + %ext = zext i1 %B3 to i32 + %A.gep = getelementptr i32, ptr %A.ptr, i16 %iv + store i32 0, ptr %A.gep + br i1 %vec.dead, label %for.end, label %loop + +for.end: + ret i32 %for +} + +; Dead instructions, like the exit condition are not part of the actual VPlan +; and do not need to be sunk. PR44634. +define void @sink_dead_inst(ptr %a) { +; CHECK-LABEL: define void @sink_dead_inst( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]] +; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1) +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5) +; CHECK-NEXT: [[TMP4]] = add <4 x i16> [[TMP1]], splat (i16 5) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i16> [[TMP5]], splat (i16 10) +; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i16> [[TMP6]], splat (i16 10) +; CHECK-NEXT: [[TMP9:%.*]] = 
getelementptr i16, ptr [[A]], i16 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[TMP9]], i32 4 +; CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[TMP10]], align 2 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[TMP11]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4) +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40 +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT2:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT1]], %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_COND]] ] +; CHECK-NEXT: [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], %[[FOR_COND]] ] +; CHECK-NEXT: [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT2]], %[[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], %[[FOR_COND]] ] +; CHECK-NEXT: [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15 +; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; CHECK-NEXT: [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32 +; CHECK-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, 
ptr [[A]], i16 [[IV]] +; CHECK-NEXT: store i16 [[USE_REC_1]], ptr [[GEP]], align 2 +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: + %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ] + %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ] + %rec.2 = phi i32 [ -27, %entry ], [ %rec.2.prev, %for.cond ] + %use.rec.1 = sub i16 %rec.1, 10 + %cmp = icmp eq i32 %rec.2, 15 + %iv.next = add i16 %iv, 1 + %rec.2.prev = zext i16 %iv.next to i32 + %rec.1.prev = add i16 %iv.next, 5 + %gep = getelementptr i16, ptr %a, i16 %iv + store i16 %use.rec.1, ptr %gep + br i1 %cmp, label %for.end, label %for.cond + +for.end: + ret void +} + +; %rec.1 only has %use.rec.1 as use, which can be removed. This enables %rec.1 +; to be removed also. +define void @unused_recurrence(ptr %a) { +; CHECK-LABEL: define void @unused_recurrence( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1) +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: 
[[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_COND]] ] +; CHECK-NEXT: [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], %[[FOR_COND]] ] +; CHECK-NEXT: [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; CHECK-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000 +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: + %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ] + %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ] + %use.rec.1 = sub i16 %rec.1, 10 + %iv.next= add i16 %iv, 1 + %rec.1.prev = add i16 %iv.next, 5 + %cmp = icmp eq i16 %iv, 1000 + br i1 %cmp, label %for.end, label %for.cond + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll index 98a942a501077..b20d59bd5760e 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-scalable-vf1.ll @@ -8,17 +8,51 @@ define i64 @pr97452_scalable_vf1_for_live_out(ptr %src) { ; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for_live_out( ; CHECK-SAME: ptr [[SRC:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call 
i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 23, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 23, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 23, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1 +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i64 0, i32 [[TMP4]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.splice.nxv1i64( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[TMP9]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[TMP7]], i32 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP13]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], 
label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[L:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RES]] ; entry: @@ -43,17 +77,51 @@ define void @pr97452_scalable_vf1_for_no_live_out(ptr %src, ptr noalias %dst) { ; CHECK-LABEL: define void @pr97452_scalable_vf1_for_no_live_out( ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 23, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 23, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 23, [[N_MOD_VF]] +; CHECK-NEXT: 
[[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1 +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i64 0, i32 [[TMP4]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD]] = load , ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.splice.nxv1i64( [[VECTOR_RECUR]], [[WIDE_LOAD]], i32 -1) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store [[TMP7]], ptr [[TMP9]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP12]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 23, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: 
[[FOR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[L:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i64 [[FOR]], ptr [[GEP_DST]], align 8 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 22 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -74,3 +142,11 @@ loop: exit: ret void } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 13dc53559d283..7684d274a75cf 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -870,7 +870,7 @@ for.end: ; } ; ; -define i32 @PR27246(ptr %dst) { +define i32 @PR27246() { ; UNROLL-NO-IC-LABEL: @PR27246( ; UNROLL-NO-IC-NEXT: entry: ; UNROLL-NO-IC-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] @@ -882,8 +882,7 @@ define i32 @PR27246(ptr %dst) { ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 8 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]] +; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0 ; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], @@ -892,13 +891,10 @@ define i32 @PR27246(ptr %dst) { ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 -4) -; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> -; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; UNROLL-NO-IC-NEXT: store i32 [[TMP3]], ptr [[TMP1]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x 
i32> [[STEP_ADD]], splat (i32 -4) -; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3 @@ -906,21 +902,19 @@ define i32 @PR27246(ptr %dst) { ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: ; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] -; UNROLL-NO-IC-NEXT: br label [[FOR_COND2:%.*]] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; UNROLL-NO-IC-NEXT: br label [[FOR_COND1:%.*]] ; UNROLL-NO-IC: for.cond.cleanup: ; UNROLL-NO-IC-NEXT: [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[E_1_LCSSA_LCSSA]] ; UNROLL-NO-IC: for.cond1: -; UNROLL-NO-IC-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]] +; UNROLL-NO-IC-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; UNROLL-NO-IC-NEXT: [[K_0]] = phi i32 
[ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; UNROLL-NO-IC-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; UNROLL-NO-IC-NEXT: store i32 [[E_1]], ptr [[GEP_DST]], align 4 -; UNROLL-NO-IC-NEXT: br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL-NO-IC: for.cond.cleanup3: -; UNROLL-NO-IC-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-IC-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -936,38 +930,33 @@ define i32 @PR27246(ptr %dst) { ; UNROLL-NO-VF: vector.ph: ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 2 ; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] -; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]] -; UNROLL-NO-VF-NEXT: br label [[FOR_COND1:%.*]] +; UNROLL-NO-VF-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1 -; 
UNROLL-NO-VF-NEXT: store i32 [[TMP2]], ptr [[TMP1]], align 4 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], -1 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] -; UNROLL-NO-VF-NEXT: br label [[FOR_COND2:%.*]] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; UNROLL-NO-VF-NEXT: br label [[FOR_COND1:%.*]] ; UNROLL-NO-VF: for.cond.cleanup: ; UNROLL-NO-VF-NEXT: [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[E_1_LCSSA_LCSSA]] ; UNROLL-NO-VF: for.cond1: -; UNROLL-NO-VF-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] -; UNROLL-NO-VF-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NO-VF-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]] +; UNROLL-NO-VF-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ 
[[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; UNROLL-NO-VF-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-VF-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; UNROLL-NO-VF-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; UNROLL-NO-VF-NEXT: store i32 [[E_1]], ptr [[GEP_DST]], align 4 -; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL-NO-VF: for.cond.cleanup3: -; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -983,24 +972,18 @@ define i32 @PR27246(ptr %dst) { ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 4 ; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]] -; SINK-AFTER-NEXT: [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]] -; SINK-AFTER-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]] -; SINK-AFTER-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3 +; SINK-AFTER-NEXT: [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]] ; SINK-AFTER-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0 ; SINK-AFTER-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; SINK-AFTER-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], -; SINK-AFTER-NEXT: br label [[FOR_COND1:%.*]] +; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] ; SINK-AFTER: vector.body: -; 
SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ] -; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[FOR_COND1]] ] -; SINK-AFTER-NEXT: [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND1]] ] -; SINK-AFTER-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> -; SINK-AFTER-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; SINK-AFTER-NEXT: store i32 [[TMP3]], ptr [[TMP1]], align 4 +; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SINK-AFTER-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4) -; SINK-AFTER-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SINK-AFTER-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]] +; SINK-AFTER-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SINK-AFTER-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 @@ -1008,21 +991,19 @@ define i32 @PR27246(ptr %dst) { ; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] ; SINK-AFTER: scalar.ph: ; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ] -; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], 
[[FOR_COND1_PREHEADER]] ] -; SINK-AFTER-NEXT: br label [[FOR_COND2:%.*]] +; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] +; SINK-AFTER-NEXT: br label [[FOR_COND1:%.*]] ; SINK-AFTER: for.cond.cleanup: ; SINK-AFTER-NEXT: [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ] ; SINK-AFTER-NEXT: ret i32 [[E_1_LCSSA_LCSSA]] ; SINK-AFTER: for.cond1: -; SINK-AFTER-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] -; SINK-AFTER-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; SINK-AFTER-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]] +; SINK-AFTER-NEXT: [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; SINK-AFTER-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; SINK-AFTER-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; SINK-AFTER-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; SINK-AFTER-NEXT: store i32 [[E_1]], ptr [[GEP_DST]], align 4 -; SINK-AFTER-NEXT: br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] ; SINK-AFTER: for.cond.cleanup3: -; SINK-AFTER-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] +; SINK-AFTER-NEXT: [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; SINK-AFTER-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; SINK-AFTER-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] @@ -1042,10 +1023,8 @@ for.cond.cleanup: for.cond1: %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, 
%for.cond1.preheader ] %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ] - %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %i.016 %cmp2 = icmp sgt i32 %k.0, 1 %dec = add nsw i32 %k.0, -1 - store i32 %e.1, ptr %gep.dst br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3 for.cond.cleanup3: @@ -1072,22 +1051,22 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) { ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8 -; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10 -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14 +; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 8 +; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 10 +; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 12 +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 14 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2 ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2 ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2 ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2 -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[TMP7]], 2 -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]] +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]] +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] +; 
UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP13]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP14]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP15]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -1169,22 +1148,22 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) { ; SINK-AFTER: vector.body: ; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; SINK-AFTER-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; SINK-AFTER-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 -; SINK-AFTER-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 2 -; SINK-AFTER-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 4 -; SINK-AFTER-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 6 +; SINK-AFTER-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0 +; SINK-AFTER-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2 +; SINK-AFTER-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4 +; SINK-AFTER-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 6 +; SINK-AFTER-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2 ; SINK-AFTER-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2 ; SINK-AFTER-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2 ; SINK-AFTER-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2 -; SINK-AFTER-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[TMP7]], 2 -; SINK-AFTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]] +; SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr 
[[A:%.*]], i64 [[TMP7]] +; SINK-AFTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] ; SINK-AFTER-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]] ; SINK-AFTER-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] -; SINK-AFTER-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] +; SINK-AFTER-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4 ; SINK-AFTER-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4 -; SINK-AFTER-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP13]], align 4 -; SINK-AFTER-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP14]], align 4 -; SINK-AFTER-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP15]], align 4 +; SINK-AFTER-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4 +; SINK-AFTER-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4 ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SINK-AFTER-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -1372,27 +1351,27 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) { ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[VECTOR_RECUR]], 1 -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], [[X]] +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], [[X]] ; UNROLL-NO-VF-NEXT: [[TMP3]] = add nuw i32 [[VECTOR_RECUR]], 2 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 96 -; UNROLL-NO-VF-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 96 +; UNROLL-NO-VF-NEXT: br i1 
[[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT1:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-VF: for.body: +; UNROLL-NO-VF-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[VAL_PHI:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADDX:%.*]], [[FOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VAL_PHI1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT1]], [[SCALAR_PH]] ], [ [[ADDX1:%.*]], [[FOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[ADDX]] = add i32 [[VAL_PHI]], 1 -; UNROLL-NO-VF-NEXT: [[BC:%.*]] = zext i32 [[VAL_PHI]] to i64 -; UNROLL-NO-VF-NEXT: [[ADDX1]] = add i32 [[VAL_PHI]], [[X]] -; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[VAL_PHI]], 95 +; UNROLL-NO-VF-NEXT: [[INC]] = add i32 [[INC_PHI]], 1 +; UNROLL-NO-VF-NEXT: [[BC:%.*]] = zext i32 [[INC_PHI]] to i64 +; UNROLL-NO-VF-NEXT: [[ADDX]] = add i32 [[INC_PHI]], [[X]] +; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95 ; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; UNROLL-NO-VF: for.end: -; UNROLL-NO-VF-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI1]], [[FOR_BODY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[VAL_PHI_LCSSA]] ; ; SINK-AFTER-LABEL: 
@extract_second_last_iteration( @@ -2644,7 +2623,7 @@ for.end: ret void } -define i32 @sink_into_replication_region(i32 %y, ptr %dst) { +define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC-LABEL: @sink_into_replication_region( ; UNROLL-NO-IC-NEXT: bb: ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 @@ -2735,74 +2714,18 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) { ; UNROLL-NO-IC: pred.udiv.continue18: ; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP37]], [[PRED_UDIV_IF17]] ] ; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; UNROLL-NO-IC-NEXT: br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE21:%.*]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE20]] ; UNROLL-NO-IC: pred.udiv.if19: ; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7 ; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]] ; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE21]] +; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE20]] ; UNROLL-NO-IC: pred.udiv.continue20: ; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE18]] ], [ [[TMP42]], [[PRED_UDIV_IF19]] ] ; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]] ; UNROLL-NO-IC-NEXT: [[TMP47]] = add <4 x i32> [[VEC_PHI1]], [[TMP45]] -; UNROLL-NO-IC-NEXT: [[TMP64:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP64]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; UNROLL-NO-IC: pred.store.if: -; UNROLL-NO-IC-NEXT: [[TMP65:%.*]] = extractelement 
<4 x i32> [[TMP44]], i32 0 -; UNROLL-NO-IC-NEXT: store i32 [[TMP65]], ptr [[DST:%.*]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE]] -; UNROLL-NO-IC: pred.store.continue: -; UNROLL-NO-IC-NEXT: [[TMP66:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP66]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] -; UNROLL-NO-IC: pred.store.if21: -; UNROLL-NO-IC-NEXT: [[TMP67:%.*]] = extractelement <4 x i32> [[TMP44]], i32 1 -; UNROLL-NO-IC-NEXT: store i32 [[TMP67]], ptr [[DST]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE22]] -; UNROLL-NO-IC: pred.store.continue22: -; UNROLL-NO-IC-NEXT: [[TMP52:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 -; UNROLL-NO-IC-NEXT: br i1 [[TMP52]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] -; UNROLL-NO-IC: pred.store.if23: -; UNROLL-NO-IC-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP44]], i32 2 -; UNROLL-NO-IC-NEXT: store i32 [[TMP53]], ptr [[DST]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE24]] -; UNROLL-NO-IC: pred.store.continue24: -; UNROLL-NO-IC-NEXT: [[TMP54:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 -; UNROLL-NO-IC-NEXT: br i1 [[TMP54]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] -; UNROLL-NO-IC: pred.store.if25: -; UNROLL-NO-IC-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP44]], i32 3 -; UNROLL-NO-IC-NEXT: store i32 [[TMP55]], ptr [[DST]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE26]] -; UNROLL-NO-IC: pred.store.continue26: -; UNROLL-NO-IC-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 -; UNROLL-NO-IC-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] -; UNROLL-NO-IC: pred.store.if27: -; UNROLL-NO-IC-NEXT: [[TMP57:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0 -; UNROLL-NO-IC-NEXT: store i32 [[TMP57]], ptr [[DST]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE28]] -; 
UNROLL-NO-IC: pred.store.continue28: -; UNROLL-NO-IC-NEXT: [[TMP58:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 -; UNROLL-NO-IC-NEXT: br i1 [[TMP58]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] -; UNROLL-NO-IC: pred.store.if29: -; UNROLL-NO-IC-NEXT: [[TMP59:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1 -; UNROLL-NO-IC-NEXT: store i32 [[TMP59]], ptr [[DST]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE30]] -; UNROLL-NO-IC: pred.store.continue30: -; UNROLL-NO-IC-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 -; UNROLL-NO-IC-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] -; UNROLL-NO-IC: pred.store.if31: -; UNROLL-NO-IC-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2 -; UNROLL-NO-IC-NEXT: store i32 [[TMP61]], ptr [[DST]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE32]] -; UNROLL-NO-IC: pred.store.continue32: -; UNROLL-NO-IC-NEXT: [[TMP62:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 -; UNROLL-NO-IC-NEXT: br i1 [[TMP62]], label [[PRED_STORE_IF33:%.*]], label [[PRED_UDIV_CONTINUE20]] -; UNROLL-NO-IC: pred.store.if33: -; UNROLL-NO-IC-NEXT: [[TMP63:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3 -; UNROLL-NO-IC-NEXT: store i32 [[TMP63]], ptr [[DST]], align 4 -; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE20]] -; UNROLL-NO-IC: pred.store.continue34: ; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] ; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]] ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -2827,7 +2750,6 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) { ; UNROLL-NO-IC-NEXT: [[VAR6]] = add i32 [[VAR5]], [[VAR4]] ; UNROLL-NO-IC-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; UNROLL-NO-IC-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 -; UNROLL-NO-IC-NEXT: store i32 [[VAR4]], ptr [[DST]], 
align 4 ; UNROLL-NO-IC-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 ; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; @@ -2859,25 +2781,15 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) { ; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE]] ; UNROLL-NO-VF: pred.udiv.continue: ; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]] ; UNROLL-NO-VF: pred.udiv.if3: ; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -1 ; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = udiv i32 219220132, [[TMP7]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE5]] +; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE4]] ; UNROLL-NO-VF: pred.udiv.continue4: ; UNROLL-NO-VF-NEXT: [[TMP9]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ] ; UNROLL-NO-VF-NEXT: [[TMP10]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]] ; UNROLL-NO-VF-NEXT: [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP6]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; UNROLL-NO-VF: pred.store.if: -; UNROLL-NO-VF-NEXT: store i32 [[VECTOR_RECUR]], ptr [[DST:%.*]], align 4 -; UNROLL-NO-VF-NEXT: br label [[PRED_STORE_CONTINUE]] -; UNROLL-NO-VF: pred.store.continue: -; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF5:%.*]], label [[PRED_UDIV_CONTINUE4]] -; UNROLL-NO-VF: pred.store.if5: -; UNROLL-NO-VF-NEXT: store i32 [[TMP6]], ptr [[DST]], align 4 -; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE4]] -; UNROLL-NO-VF: pred.store.continue6: ; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP10]], i32 [[VEC_PHI]] ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]] ; 
UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 @@ -2901,7 +2813,6 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) { ; UNROLL-NO-VF-NEXT: [[VAR6]] = add i32 [[VAR5]], [[VAR4]] ; UNROLL-NO-VF-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 -; UNROLL-NO-VF-NEXT: store i32 [[VAR4]], ptr [[DST]], align 4 ; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 ; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; @@ -2956,44 +2867,16 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) { ; SINK-AFTER: pred.udiv.continue6: ; SINK-AFTER-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP16]], [[PRED_UDIV_IF5]] ] ; SINK-AFTER-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 -; SINK-AFTER-NEXT: br i1 [[TMP18]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]] +; SINK-AFTER-NEXT: br i1 [[TMP18]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8]] ; SINK-AFTER: pred.udiv.if7: ; SINK-AFTER-NEXT: [[TMP19:%.*]] = add i32 [[OFFSET_IDX]], -3 ; SINK-AFTER-NEXT: [[TMP20:%.*]] = udiv i32 219220132, [[TMP19]] ; SINK-AFTER-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i32 3 -; SINK-AFTER-NEXT: br label [[PRED_UDIV_CONTINUE9]] +; SINK-AFTER-NEXT: br label [[PRED_UDIV_CONTINUE8]] ; SINK-AFTER: pred.udiv.continue8: ; SINK-AFTER-NEXT: [[TMP22]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ] ; SINK-AFTER-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> ; SINK-AFTER-NEXT: [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]] -; SINK-AFTER-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 -; SINK-AFTER-NEXT: br i1 [[TMP33]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; SINK-AFTER: pred.store.if: -; SINK-AFTER-NEXT: 
[[TMP34:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0 -; SINK-AFTER-NEXT: store i32 [[TMP34]], ptr [[DST:%.*]], align 4 -; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE]] -; SINK-AFTER: pred.store.continue: -; SINK-AFTER-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 -; SINK-AFTER-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; SINK-AFTER: pred.store.if9: -; SINK-AFTER-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1 -; SINK-AFTER-NEXT: store i32 [[TMP28]], ptr [[DST]], align 4 -; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE10]] -; SINK-AFTER: pred.store.continue10: -; SINK-AFTER-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 -; SINK-AFTER-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] -; SINK-AFTER: pred.store.if11: -; SINK-AFTER-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2 -; SINK-AFTER-NEXT: store i32 [[TMP30]], ptr [[DST]], align 4 -; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE12]] -; SINK-AFTER: pred.store.continue12: -; SINK-AFTER-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 -; SINK-AFTER-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF13:%.*]], label [[PRED_UDIV_CONTINUE8]] -; SINK-AFTER: pred.store.if13: -; SINK-AFTER-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3 -; SINK-AFTER-NEXT: store i32 [[TMP32]], ptr [[DST]], align 4 -; SINK-AFTER-NEXT: br label [[PRED_UDIV_CONTINUE8]] -; SINK-AFTER: pred.store.continue14: ; SINK-AFTER-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -3016,7 +2899,6 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) { ; SINK-AFTER-NEXT: [[VAR6]] = add i32 [[VAR5]], [[VAR4]] ; SINK-AFTER-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; SINK-AFTER-NEXT: 
[[VAR8]] = add nsw i32 [[VAR3]], -1 -; SINK-AFTER-NEXT: store i32 [[VAR4]], ptr [[DST]], align 4 ; SINK-AFTER-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 ; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; @@ -3034,7 +2916,6 @@ bb: %var6 = add i32 %var5, %var4 %var7 = udiv i32 219220132, %var3 %var8 = add nsw i32 %var3, -1 - store i32 %var4, ptr %dst %var9 = icmp slt i32 %var3, 2 br i1 %var9, label %bb1, label %bb2, !prof !2 } @@ -3524,12 +3405,12 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16 ; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add i16 [[OFFSET_IDX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[TMP1]], 1 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = or i16 [[TMP5]], [[TMP5]] +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1 +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i16 [[TMP1]], 1 ; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]] -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = zext i16 [[TMP3]] to i32 -; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = zext i16 [[TMP4]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = or i16 [[TMP3]], [[TMP3]] +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = zext i16 [[TMP4]] to i32 +; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = zext i16 [[TMP5]] to i32 ; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]] ; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]] ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[TMP8]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll index a622193290c8f..e27734755dfb2 100644 --- 
a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll @@ -59,8 +59,8 @@ define void @ptr_depends_on_sdiv(ptr noalias %dst, i16 noundef %off) { ; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 1 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; CHECK: [[PRED_STORE_CONTINUE4]]: -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] @@ -132,8 +132,8 @@ define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) { ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP12]], align 1 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] @@ -203,8 +203,8 @@ define void @ptr_doesnt_depend_on_poison_or_ub(ptr noalias %dst, i16 noundef %of ; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP12]], align 1 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] @@ -276,8 +276,8 @@ define void @ptr_depends_on_possibly_poison_value_from_load(ptr noalias %dst) { ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP13]], align 1 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] @@ -350,8 +350,8 @@ define void @ptr_depends_on_noundef_load(ptr noalias %dst) { ; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP13]], align 1 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll index d973e451d887d..a4f2b077cb066 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -282,3 +282,141 @@ for.cond: ; preds = %for.body, %entry for.end: ; preds = %for.cond ret void } + +; Test that when WidenPointerInductionRecipes are ordered before other +; WidenIntOrFpInductionRecipes that their PHIs are emitted in the right place. 
+define void @outside_lattice(ptr noalias %p, ptr noalias %q, i32 %n) { +; DEFAULT-LABEL: @outside_lattice( +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = zext i32 [[N:%.*]] to i64 +; DEFAULT-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1) +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 4 +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; DEFAULT: vector.scevcheck: +; DEFAULT-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1) +; DEFAULT-NEXT: [[TMP1:%.*]] = add i32 [[UMAX]], -1 +; DEFAULT-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0 +; DEFAULT-NEXT: br i1 [[TMP2]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; DEFAULT: vector.ph: +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 4 +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]] +; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4 +; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP3]] +; DEFAULT-NEXT: [[IND_END2:%.*]] = trunc i64 [[N_VEC]] to i32 +; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; DEFAULT: vector.body: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> +; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 +; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[OFFSET_IDX]] +; DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0 +; DEFAULT-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8 +; DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i32 [[OFFSET_IDX]] +; DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr 
inbounds i32, ptr [[TMP7]], i32 0 +; DEFAULT-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP8]], align 4 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; DEFAULT-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16 +; DEFAULT-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; DEFAULT: middle.block: +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; DEFAULT: scalar.ph: +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ], [ [[P]], [[VECTOR_SCEVCHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] +; DEFAULT: for.body: +; DEFAULT-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_PTR_NEXT:%.*]], [[FOR_BODY]] ] +; DEFAULT-NEXT: [[IV_INT:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[IV_INT_NEXT:%.*]], [[FOR_BODY]] ] +; DEFAULT-NEXT: [[P_GEP:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[IV_INT]] +; DEFAULT-NEXT: store ptr [[IV_PTR]], ptr [[P_GEP]], align 8 +; DEFAULT-NEXT: [[Q_GEP:%.*]] = getelementptr inbounds i32, ptr [[Q]], i32 [[IV_INT]] +; DEFAULT-NEXT: store i32 [[IV_INT]], ptr [[Q_GEP]], align 4 +; DEFAULT-NEXT: [[IV_INT_NEXT]] = add i32 [[IV_INT]], 1 +; DEFAULT-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i32, ptr [[IV_PTR]], i32 1 +; DEFAULT-NEXT: [[DONE:%.*]] = icmp ult i32 [[IV_INT_NEXT]], [[N]] +; DEFAULT-NEXT: br i1 [[DONE]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]] +; DEFAULT: for.end: +; DEFAULT-NEXT: ret void +; +; STRIDED-LABEL: @outside_lattice( +; STRIDED-NEXT: entry: +; STRIDED-NEXT: 
[[TMP0:%.*]] = zext i32 [[N:%.*]] to i64 +; STRIDED-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1) +; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 4 +; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; STRIDED: vector.scevcheck: +; STRIDED-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1) +; STRIDED-NEXT: [[TMP1:%.*]] = add i32 [[UMAX]], -1 +; STRIDED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0 +; STRIDED-NEXT: br i1 [[TMP2]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; STRIDED: vector.ph: +; STRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 4 +; STRIDED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]] +; STRIDED-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4 +; STRIDED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP3]] +; STRIDED-NEXT: [[IND_END2:%.*]] = trunc i64 [[N_VEC]] to i32 +; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; STRIDED: vector.body: +; STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; STRIDED-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> +; STRIDED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 +; STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[OFFSET_IDX]] +; STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0 +; STRIDED-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8 +; STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i32 [[OFFSET_IDX]] +; STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; STRIDED-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP8]], align 4 +; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 4 +; STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16 +; STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; STRIDED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; STRIDED: middle.block: +; STRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] +; STRIDED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; STRIDED: scalar.ph: +; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ], [ [[P]], [[VECTOR_SCEVCHECK]] ] +; STRIDED-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; STRIDED-NEXT: br label [[FOR_BODY:%.*]] +; STRIDED: for.body: +; STRIDED-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_PTR_NEXT:%.*]], [[FOR_BODY]] ] +; STRIDED-NEXT: [[IV_INT:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[IV_INT_NEXT:%.*]], [[FOR_BODY]] ] +; STRIDED-NEXT: [[P_GEP:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[IV_INT]] +; STRIDED-NEXT: store ptr [[IV_PTR]], ptr [[P_GEP]], align 8 +; STRIDED-NEXT: [[Q_GEP:%.*]] = getelementptr inbounds i32, ptr [[Q]], i32 [[IV_INT]] +; STRIDED-NEXT: store i32 [[IV_INT]], ptr [[Q_GEP]], align 4 +; STRIDED-NEXT: [[IV_INT_NEXT]] = add i32 [[IV_INT]], 1 +; STRIDED-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i32, ptr [[IV_PTR]], i32 1 +; STRIDED-NEXT: [[DONE:%.*]] = icmp ult i32 [[IV_INT_NEXT]], [[N]] +; STRIDED-NEXT: br i1 [[DONE]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]] +; STRIDED: for.end: +; STRIDED-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv.ptr = phi ptr [ %p, %entry ], [ %iv.ptr.next, %for.body ] + %iv.int = phi i32 [ 0, %entry ], [ %iv.int.next, %for.body ] + + %p.gep = getelementptr 
inbounds ptr, ptr %p, i32 %iv.int + store ptr %iv.ptr, ptr %p.gep + + %q.gep = getelementptr inbounds i32, ptr %q, i32 %iv.int + store i32 %iv.int, ptr %q.gep + + %iv.int.next = add i32 %iv.int, 1 + %iv.ptr.next = getelementptr inbounds i32, ptr %iv.ptr, i32 1 + + %done = icmp ult i32 %iv.int.next, %n + br i1 %done, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll index d497f0c22dbbc..fbe3a7a470e86 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll @@ -116,7 +116,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] -; CHECK: pred.load.if3: +; CHECK: pred.load.if2: ; CHECK-NEXT: [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 @@ -125,12 +125,12 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] -; CHECK: pred.load.continue4: +; CHECK: pred.load.continue3: ; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2 ; CHECK-NEXT: 
br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] -; CHECK: pred.load.if5: +; CHECK: pred.load.if4: ; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 @@ -139,12 +139,12 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] -; CHECK: pred.load.continue6: +; CHECK: pred.load.continue5: ; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3 ; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.if7: +; CHECK: pred.load.if6: ; CHECK-NEXT: [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]] ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 @@ -153,7 +153,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: +; CHECK: pred.load.continue7: ; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x 
i32> [[VEC_IND1]], <4 x i32> zeroinitializer @@ -321,7 +321,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] -; CHECK: pred.load.if3: +; CHECK: pred.load.if2: ; CHECK-NEXT: [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 @@ -330,12 +330,12 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] -; CHECK: pred.load.continue4: +; CHECK: pred.load.continue3: ; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2 ; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] -; CHECK: pred.load.if5: +; CHECK: pred.load.if4: ; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 @@ -344,12 +344,12 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] -; CHECK: 
pred.load.continue6: +; CHECK: pred.load.continue5: ; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3 ; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.if7: +; CHECK: pred.load.if6: ; CHECK-NEXT: [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]] ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 @@ -358,7 +358,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: +; CHECK: pred.load.continue7: ; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> splat (i32 1) @@ -436,7 +436,7 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1 ; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] -; CHECK: pred.load.if3: +; CHECK: pred.load.if2: ; CHECK-NEXT: [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = load i32, 
ptr [[TMP15]], align 4 @@ -445,12 +445,12 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP22]], i64 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] -; CHECK: pred.load.continue4: +; CHECK: pred.load.continue3: ; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2 ; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] -; CHECK: pred.load.if5: +; CHECK: pred.load.if4: ; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 @@ -459,12 +459,12 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP28]], align 4 ; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP32]], i64 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] -; CHECK: pred.load.continue6: +; CHECK: pred.load.continue5: ; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP34:%.*]] = phi <4 x i32> [ [[TMP24]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP33]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3 ; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.if7: +; CHECK: pred.load.if6: ; CHECK-NEXT: [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP31]] ; CHECK-NEXT: 
[[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 @@ -473,7 +473,7 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP38]], align 4 ; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP48]], i64 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: +; CHECK: pred.load.continue7: ; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP50:%.*]] = phi <4 x i32> [ [[TMP34]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP49]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll index a70d8f72c8a33..bb84dbf8ed236 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll @@ -658,9 +658,9 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) { ; CHECK-VF4UF1-NEXT: [[N_VEC:%.*]] = sub i32 96, [[N_MOD_VF]] ; CHECK-VF4UF1-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-VF4UF1-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4 -; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() ; CHECK-VF4UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 ; CHECK-VF4UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() ; CHECK-VF4UF1-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i32 1) ; CHECK-VF4UF1-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] ; CHECK-VF4UF1-NEXT: [[TMP8:%.*]] = mul i32 1, [[TMP5]] @@ -707,17 +707,17 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) { ; CHECK-VF4UF2-NEXT: 
[[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-VF4UF2-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4 ; CHECK-VF4UF2-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 2 -; CHECK-VF4UF2-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv4i32() ; CHECK-VF4UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 ; CHECK-VF4UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-VF4UF2-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv4i32() ; CHECK-VF4UF2-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) ; CHECK-VF4UF2-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] -; CHECK-VF4UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 -; CHECK-VF4UF2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-VF4UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4UF2: [[VECTOR_BODY]]: ; CHECK-VF4UF2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF2-NEXT: [[STEP_ADD:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT1:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; CHECK-VF4UF2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-VF4UF2-NEXT: [[VEC_IND_NEXT:%.*]] = add [[STEP_ADD]], [[BROADCAST_SPLAT2]] ; CHECK-VF4UF2-NEXT: [[TMP9:%.*]] = add [[VEC_IND_NEXT]], [[BROADCAST_SPLAT]] ; CHECK-VF4UF2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]] diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll index f136b0e2e0b31..10f96284c0184 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll @@ -23,12 +23,12 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias 
nocapture ; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() @@ -103,12 +103,12 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv1i64() -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: 
[[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll index 2a48e0a5e5310..15db687ba64ff 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll @@ -19,17 +19,17 @@ define i32 @iv_live_out_wide(ptr %dst) { ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 2 ; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i32() ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[STEP_2]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i32() ; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds 
i16, ptr [[TMP10]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 5ff43dcf42bcf..ec1e8fa1e1b33 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -918,8 +918,8 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.split: ; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll index 130db548ca8cb..fe533672f2ca2 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -291,8 +291,8 @@ define void @redundant_branch_and_blends_without_mask(ptr %A) { ; CHECK-NEXT: store i32 [[TMP34]], ptr [[TMP8]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; CHECK: [[PRED_STORE_CONTINUE12]]: -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] diff --git 
a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll index 59277186195fc..7654bc9a141e0 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll @@ -65,3 +65,68 @@ loop.latch: exit: ret void } + +; Check that VPWidenIntOrFPInductionRecipe is expanded into smaller recipes in +; the final VPlan. +define void @iv_expand(ptr %p, i64 %n) { +; CHECK-LABEL: LV: Checking a loop in 'iv_expand' +; CHECK: VPlan 'Initial VPlan for VF={8},UF>=1' { +; CHECK: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<%0> +; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%3>, ir<1> +; CHECK-NEXT: CLONE ir<%q> = getelementptr ir<%p>, vp<%4> +; CHECK-NEXT: vp<%5> = vector-pointer ir<%q> +; CHECK-NEXT: WIDEN ir<%x> = load vp<%5> +; CHECK-NEXT: WIDEN ir<%y> = add ir<%x>, ir<%iv> +; CHECK-NEXT: vp<%6> = vector-pointer ir<%q> +; CHECK-NEXT: WIDEN store vp<%6>, ir<%y> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK: VPlan 'Final VPlan for VF={8},UF={1}' +; CHECK: ir-bb: +; CHECK-NEXT: IR %n.mod.vf = urem i64 %n, 8 +; CHECK-NEXT: IR %n.vec = sub i64 %n, %n.mod.vf +; CHECK-NEXT: EMIT vp<[[STEP_VECTOR:%.+]]> = step-vector +; CHECK-NEXT: EMIT vp<[[BROADCAST_0:%.+]]> = broadcast ir<0> +; CHECK-NEXT: EMIT vp<[[BROADCAST_1:%.+]]> = broadcast ir<1> +; CHECK-NEXT: EMIT vp<[[MUL:%.+]]> = mul vp<[[STEP_VECTOR]]>, vp<[[BROADCAST_1]]> +; CHECK-NEXT: EMIT vp<[[INDUCTION:%.+]]> = add vp<[[BROADCAST_0]]>, vp<[[MUL]]> +; CHECK-NEXT: EMIT vp<[[TRUNC:%.+]]> = trunc ir<8> to i64 +; CHECK-NEXT: EMIT vp<[[INC:%.+]]> = mul ir<1>, vp<[[TRUNC]]> +; CHECK-NEXT: EMIT vp<[[BROADCAST_INC:%.+]]> = 
broadcast vp<[[INC]]> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<[[SCALAR_PHI:%.+]]> = phi [ ir<0>, ir-bb ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-PHI ir<%iv> = phi [ vp<[[INDUCTION]]>, ir-bb ], [ vp<%vec.ind.next>, vector.body ] +; CHECK-NEXT: CLONE ir<%q> = getelementptr ir<%p>, vp<[[SCALAR_PHI]]> +; CHECK-NEXT: vp<[[VEC_PTR_1:%.+]]> = vector-pointer ir<%q> +; CHECK-NEXT: WIDEN ir<%x> = load vp<[[VEC_PTR_1]]> +; CHECK-NEXT: WIDEN ir<%y> = add ir<%x>, ir<%iv> +; CHECK-NEXT: vp<[[VEC_PTR_2:%.+]]> = vector-pointer ir<%q> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR_2]]>, ir<%y> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[SCALAR_PHI]]>, ir<8> +; CHECK-NEXT: EMIT vp<%vec.ind.next> = add ir<%iv>, vp<[[BROADCAST_INC]]> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +entry: + br label %loop + +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %q = getelementptr i64, ptr %p, i64 %iv + %x = load i64, ptr %q + %y = add i64 %x, %iv + store i64 %y, ptr %q + %iv.next = add i64 %iv, 1 + %done = icmp eq i64 %iv.next, %n + br i1 %done, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml b/llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml new file mode 100644 index 0000000000000..459d45032b0c4 --- /dev/null +++ b/llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml @@ -0,0 +1,5 @@ +--- +CfiFunctionDefs: + - f1 + - f2 +... 
diff --git a/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll b/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll index 51a2a59365434..34e740771fe2b 100644 --- a/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll +++ b/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll @@ -1,7 +1,7 @@ ; RUN: opt -S %s -passes=lowertypetests | FileCheck %s ; CHECK: @badfileops = internal global %struct.f { ptr @bad_f, ptr @bad_f } -; CHECK: @bad_f = internal alias void (), ptr @.cfi.jumptable +; CHECK: @bad_f = internal alias [8 x i8], ptr @.cfi.jumptable ; CHECK: define internal void @bad_f.cfi() !type !0 { ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll b/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll index 7dda7f6df10c3..7eede8b7322f8 100644 --- a/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll +++ b/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll @@ -1,5 +1,6 @@ ; REQUIRES: x86-registered-target ; RUN: opt -S -passes=lowertypetests %s | FileCheck %s +; RUN: opt -S -passes=lowertypetests -lowertypetests-summary-action=import -lowertypetests-read-summary=%p/Inputs/import-thinlto-funcs.yaml %s | FileCheck %s ;; This is a check to assert we don't crash with: ;; @@ -7,6 +8,7 @@ ;; ;; So this just needs to exit normally. 
; RUN: opt -S -passes=lowertypetests %s | llc -asm-verbose=false +; RUN: opt -S -passes=lowertypetests -lowertypetests-summary-action=import -lowertypetests-read-summary=%p/Inputs/import-thinlto-funcs.yaml %s | llc -asm-verbose=false target datalayout = "e-p:64:64" target triple = "x86_64-pc-windows-msvc" diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll b/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll index 0c5324ee96c93..6b821186b0ad7 100644 --- a/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll +++ b/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll @@ -12,7 +12,7 @@ RUN: opt test1.bc -passes=lowertypetests -lowertypetests-read-summary=in.yaml \ RUN: -lowertypetests-summary-action=export -lowertypetests-write-summary=exported.yaml \ RUN: -S -o - | FileCheck %s --check-prefix=REGULAR REGULAR: @__typeid__ZTSFvvE_global_addr = hidden alias i8, ptr @.cfi.jumptable -REGULAR: @f = alias void (), ptr @.cfi.jumptable +REGULAR: @f = alias [8 x i8], ptr @.cfi.jumptable REGULAR: define private void @.cfi.jumptable() ;; CHECK that @llvm.type.test() is lowered to an actual check. 
diff --git a/llvm/test/Transforms/LowerTypeTests/export-alias.ll b/llvm/test/Transforms/LowerTypeTests/export-alias.ll index 255e6b6ca4d17..45b4db63def18 100644 --- a/llvm/test/Transforms/LowerTypeTests/export-alias.ll +++ b/llvm/test/Transforms/LowerTypeTests/export-alias.ll @@ -1,7 +1,7 @@ ; RUN: opt -S %s -passes=lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/exported-funcs.yaml | FileCheck %s ; -; CHECK: @alias1 = weak alias void (), ptr @external_addrtaken -; CHECK: @alias2 = hidden alias void (), ptr @external_addrtaken +; CHECK: @alias1 = weak alias [8 x i8], ptr @external_addrtaken +; CHECK: @alias2 = hidden alias [8 x i8], ptr @external_addrtaken ; CHECK-NOT: @alias3 = alias ; CHECK-NOT: @not_present diff --git a/llvm/test/Transforms/LowerTypeTests/export-icall.ll b/llvm/test/Transforms/LowerTypeTests/export-icall.ll index 47156deb57de7..f8adb2d69910f 100644 --- a/llvm/test/Transforms/LowerTypeTests/export-icall.ll +++ b/llvm/test/Transforms/LowerTypeTests/export-icall.ll @@ -40,15 +40,15 @@ define void @f3(i32 %x) !type !8 { ; CHECK-DAG: @__typeid_typeid1_align = hidden alias i8, inttoptr (i64 3 to ptr) ; CHECK-DAG: @__typeid_typeid1_size_m1 = hidden alias i8, inttoptr (i64 4 to ptr) -; CHECK-DAG: @h = alias void (i8), ptr [[JT1]] -; CHECK-DAG: @f = alias void (i32), {{.*}}getelementptr {{.*}}ptr [[JT1]] -; CHECK-DAG: @f2 = alias void (i32), {{.*}}getelementptr {{.*}}ptr [[JT1]] -; CHECK-DAG: @external.cfi_jt = hidden alias void (), {{.*}}getelementptr {{.*}}ptr [[JT1]] -; CHECK-DAG: @external_weak.cfi_jt = hidden alias void (), {{.*}}getelementptr {{.*}}ptr [[JT1]] +; CHECK-DAG: @h = alias [8 x i8], ptr [[JT1]] +; CHECK-DAG: @f = alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]] +; CHECK-DAG: @f2 = alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]] +; CHECK-DAG: @external.cfi_jt = hidden alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]] +; CHECK-DAG: @external_weak.cfi_jt = hidden alias [8 x i8], 
{{.*}}getelementptr {{.*}}ptr [[JT1]] ; CHECK-DAG: @__typeid_typeid2_global_addr = hidden alias i8, ptr [[JT2:.*]] -; CHECK-DAG: @g = alias void (), ptr [[JT2]] +; CHECK-DAG: @g = alias [8 x i8], ptr [[JT2]] ; CHECK-DAG: define hidden void @h.cfi(i8 {{.*}}) !type !{{.*}} ; CHECK-DAG: declare !type !{{.*}} void @external() diff --git a/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll b/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll index d7ba3a6814194..ae676df6e9f31 100644 --- a/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll +++ b/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll @@ -5,8 +5,8 @@ target datalayout = "e-p:64:64" -; X64: @g = alias void (), ptr @[[JT1:.*]] -; X64: @f = alias void (), ptr @[[JT0:.*]] +; X64: @g = alias [8 x i8], ptr @[[JT1:.*]] +; X64: @f = alias [8 x i8], ptr @[[JT0:.*]] ; WASM32: private constant [0 x i8] zeroinitializer @0 = private unnamed_addr constant [2 x ptr] [ptr @f, ptr @g], align 16 diff --git a/llvm/test/Transforms/LowerTypeTests/function.ll b/llvm/test/Transforms/LowerTypeTests/function.ll index 5b0852c82ea68..ab3cfb6acccf8 100644 --- a/llvm/test/Transforms/LowerTypeTests/function.ll +++ b/llvm/test/Transforms/LowerTypeTests/function.ll @@ -28,14 +28,13 @@ target datalayout = "e-p:64:64" ; NATIVE: private constant [0 x i8] zeroinitializer ; WASM32: private constant [0 x i8] zeroinitializer -; NATIVE: @f = alias void (), ptr @[[JT:.*]] - -; X86: @g = internal alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1) -; ARM: @g = internal alias void (), getelementptr inbounds ([2 x [4 x i8]], ptr @[[JT]], i64 0, i64 1) -; THUMB: @g = internal alias void (), getelementptr inbounds ([2 x [4 x i8]], ptr @[[JT]], i64 0, i64 1) -; THUMBV6M: @g = internal alias void (), getelementptr inbounds ([2 x [16 x i8]], ptr @[[JT]], i64 0, i64 1) -; RISCV: @g = internal alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1) -; LOONGARCH64: @g = internal 
alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1) +; JT4: @f = alias [4 x i8], ptr @[[JT:.*]] +; JT8: @f = alias [8 x i8], ptr @[[JT:.*]] +; JT16: @f = alias [16 x i8], ptr @[[JT:.*]] + +; JT4: @g = internal alias [4 x i8], getelementptr inbounds ([2 x [4 x i8]], ptr @[[JT]], i64 0, i64 1) +; JT8: @g = internal alias [8 x i8], getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1) +; JT16: @g = internal alias [16 x i8], getelementptr inbounds ([2 x [16 x i8]], ptr @[[JT]], i64 0, i64 1) ; NATIVE: define hidden void @f.cfi() ; WASM32: define void @f() !type !{{[0-9]+}} !wasm.index ![[I0:[0-9]+]] diff --git a/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll b/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll index f67e0b1711652..8cb41398e8f53 100644 --- a/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll +++ b/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll @@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux" ; CHECK: @0 = private constant { i32, [0 x i8], i32 } { i32 1, [0 x i8] zeroinitializer, i32 2 } ; CHECK: @g1 = alias i32, ptr @0 ; CHECK: @g2 = alias i32, getelementptr inbounds ({ i32, [0 x i8], i32 }, ptr @0, i32 0, i32 2) -; CHECK: @f1 = alias void (), ptr @.cfi.jumptable -; CHECK: @f2 = alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @.cfi.jumptable, i64 0, i64 1) +; CHECK: @f1 = alias [8 x i8], ptr @.cfi.jumptable +; CHECK: @f2 = alias [8 x i8], getelementptr inbounds ([2 x [8 x i8]], ptr @.cfi.jumptable, i64 0, i64 1) @g1 = constant i32 1 @g2 = constant i32 2 diff --git a/llvm/test/Transforms/LowerTypeTests/pr37625.ll b/llvm/test/Transforms/LowerTypeTests/pr37625.ll index 639cc3fa32bcb..cf52cdf0759a3 100644 --- a/llvm/test/Transforms/LowerTypeTests/pr37625.ll +++ b/llvm/test/Transforms/LowerTypeTests/pr37625.ll @@ -11,4 +11,4 @@ declare !type !2 extern_weak void @external_addrtaken(i8) !1 = !{!"external_addrtaken", i8 0, !2} !2 = !{i64 0, !"typeid1"} -; 
CHECK-DAG: @external_addrtaken = alias void (i8), ptr @.cfi.jumptable +; CHECK-DAG: @external_addrtaken = alias [8 x i8], ptr @.cfi.jumptable diff --git a/llvm/test/Transforms/LowerTypeTests/section.ll b/llvm/test/Transforms/LowerTypeTests/section.ll index d0d3c212c826e..bd91389c60ef0 100644 --- a/llvm/test/Transforms/LowerTypeTests/section.ll +++ b/llvm/test/Transforms/LowerTypeTests/section.ll @@ -5,7 +5,7 @@ target triple = "x86_64-unknown-linux-gnu" -; CHECK: @f = alias void (), ptr @[[JT:.*]] +; CHECK: @f = alias [8 x i8], ptr @[[JT:.*]] ; CHECK: define hidden void @f.cfi() section "xxx" define void @f() section "xxx" !type !0 { diff --git a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll new file mode 100644 index 0000000000000..ad4452431a483 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll @@ -0,0 +1,720 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA4 +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA3 +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: opt < %s -passes="default" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -passes="default" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 +; RUN: opt < %s -passes="default" -S -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA4 +; RUN: opt < %s -passes="default" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX_FMA3 +; RUN: opt < 
%s -passes="default" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 + +; This test checks the vectorisation of FMUL+ADDSUB/FMADDSUB patterns, including cases with undef elements. + +; Ideally, this should reach the backend with 1 fmul, 1 fsub, 1 fadd, and 1 shuffle. +; That may require some coordination between VectorCombine, SLP, and other passes. + +define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 { +; CHECK-LABEL: @buildvector_mul_addsub_ps128( +; CHECK-NEXT: [[A:%.*]] = fmul <4 x float> [[C:%.*]], [[D:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = fsub <4 x float> [[A]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[B]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] +; + %A = fmul <4 x float> %C, %D + %A0 = extractelement <4 x float> %A, i32 0 + %B0 = extractelement <4 x float> %B, i32 0 + %sub0 = fsub float %A0, %B0 + %A2 = extractelement <4 x float> %A, i32 2 + %B2 = extractelement <4 x float> %B, i32 2 + %sub2 = fsub float %A2, %B2 + %A1 = extractelement <4 x float> %A, i32 1 + %B1 = extractelement <4 x float> %B, i32 1 + %add1 = fadd float %A1, %B1 + %A3 = extractelement <4 x float> %A, i32 3 + %B3 = extractelement <4 x float> %B, i32 3 + %add3 = fadd float %A3, %B3 + %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1 + %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2 + %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3 + ret <4 x float> %vecinsert4 +} + +define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 { +; CHECK-LABEL: @buildvector_mul_addsub_pd128( +; CHECK-NEXT: [[A:%.*]] = fmul <2 x double> [[C:%.*]], [[D:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = fsub <2 x double> [[A]], [[B:%.*]] +; 
CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A]], [[B]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x i32> +; CHECK-NEXT: ret <2 x double> [[TMP2]] +; + %A = fmul <2 x double> %C, %D + %A0 = extractelement <2 x double> %A, i32 0 + %B0 = extractelement <2 x double> %B, i32 0 + %sub0 = fsub double %A0, %B0 + %A1 = extractelement <2 x double> %A, i32 1 + %B1 = extractelement <2 x double> %B, i32 1 + %add1 = fadd double %A1, %B1 + %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0 + %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1 + ret <2 x double> %vecinsert2 +} + +define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 { +; SSE2-LABEL: @buildvector_mul_addsub_ps256( +; SSE2-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] +; SSE2-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]] +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> +; SSE2-NEXT: ret <8 x float> [[TMP4]] +; +; SSE4-LABEL: @buildvector_mul_addsub_ps256( +; SSE4-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] +; SSE4-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]] +; SSE4-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> +; SSE4-NEXT: ret <8 x float> [[TMP2]] +; +; AVX-LABEL: @buildvector_mul_addsub_ps256( +; AVX-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] +; AVX-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]] +; AVX-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x 
float> [[TMP1]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[TMP2]] +; + %A = fmul <8 x float> %C, %D + %A0 = extractelement <8 x float> %A, i32 0 + %B0 = extractelement <8 x float> %B, i32 0 + %sub0 = fsub float %A0, %B0 + %A2 = extractelement <8 x float> %A, i32 2 + %B2 = extractelement <8 x float> %B, i32 2 + %sub2 = fsub float %A2, %B2 + %A4 = extractelement <8 x float> %A, i32 4 + %B4 = extractelement <8 x float> %B, i32 4 + %sub4 = fsub float %A4, %B4 + %A6 = extractelement <8 x float> %A, i32 6 + %B6 = extractelement <8 x float> %B, i32 6 + %sub6 = fsub float %A6, %B6 + %A1 = extractelement <8 x float> %A, i32 1 + %B1 = extractelement <8 x float> %B, i32 1 + %add1 = fadd float %A1, %B1 + %A3 = extractelement <8 x float> %A, i32 3 + %B3 = extractelement <8 x float> %B, i32 3 + %add3 = fadd float %A3, %B3 + %A5 = extractelement <8 x float> %A, i32 5 + %B5 = extractelement <8 x float> %B, i32 5 + %add5 = fadd float %A5, %B5 + %A7 = extractelement <8 x float> %A, i32 7 + %B7 = extractelement <8 x float> %B, i32 7 + %add7 = fadd float %A7, %B7 + %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0 + %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1 + %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2 + %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3 + %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4 + %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5 + %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6 + %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7 + ret <8 x float> %vecinsert8 +} + +define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 { +; CHECK-LABEL: @buildvector_mul_addsub_pd256( +; CHECK-NEXT: [[A:%.*]] = fmul <4 x double> [[C:%.*]], [[D:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = fsub <4 x double> [[A]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fadd 
<4 x double> [[A]], [[B]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> +; CHECK-NEXT: ret <4 x double> [[TMP2]] +; + %A = fmul <4 x double> %C, %D + %A0 = extractelement <4 x double> %A, i32 0 + %B0 = extractelement <4 x double> %B, i32 0 + %sub0 = fsub double %A0, %B0 + %A2 = extractelement <4 x double> %A, i32 2 + %B2 = extractelement <4 x double> %B, i32 2 + %sub2 = fsub double %A2, %B2 + %A1 = extractelement <4 x double> %A, i32 1 + %B1 = extractelement <4 x double> %B, i32 1 + %add1 = fadd double %A1, %B1 + %A3 = extractelement <4 x double> %A, i32 3 + %B3 = extractelement <4 x double> %B, i32 3 + %add3 = fadd double %A3, %B3 + %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0 + %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1 + %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2 + %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3 + ret <4 x double> %vecinsert4 +} + +define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 { +; SSE-LABEL: @buildvector_mul_addsub_ps512( +; SSE-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]] +; SSE-NEXT: [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <12 x i32> +; SSE-NEXT: [[TMP2:%.*]] = fsub <12 x float> [[TMP0]], [[TMP1]] +; SSE-NEXT: [[TMP3:%.*]] = fadd <12 x float> [[TMP0]], [[TMP1]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <12 x float> [[TMP2]], <12 x float> [[TMP3]], <12 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] +; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]] +; SSE-NEXT: 
[[TMP9:%.*]] = shufflevector <12 x float> [[TMP4]], <12 x float> , <16 x i32> +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP8]], <16 x i32> +; SSE-NEXT: [[VECINSERT161:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP10]], <16 x i32> +; SSE-NEXT: ret <16 x float> [[VECINSERT161]] +; +; AVX-LABEL: @buildvector_mul_addsub_ps512( +; AVX-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]] +; AVX-NEXT: [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[TMP0]], [[TMP1]] +; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP0]], [[TMP1]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP3]], <8 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP5]], [[TMP6]] +; AVX-NEXT: [[TMP8:%.*]] = fsub <4 x float> [[TMP5]], [[TMP6]] +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP11:%.*]] = fsub <2 x float> [[TMP9]], [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP9]], [[TMP10]] +; AVX-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> , <16 x i32> +; AVX-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <16 x i32> +; AVX-NEXT: [[VECINSERT141:%.*]] = shufflevector <16 x float> [[TMP13]], <16 x float> [[TMP14]], <16 x i32> +; AVX-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP12]], <16 x i32> +; AVX-NEXT: [[VECINSERT162:%.*]] = shufflevector <16 x float> [[VECINSERT141]], <16 x float> [[TMP15]], <16 x 
i32> +; AVX-NEXT: ret <16 x float> [[VECINSERT162]] +; + %A = fmul <16 x float> %C, %D + %A0 = extractelement <16 x float> %A, i32 0 + %B0 = extractelement <16 x float> %B, i32 0 + %sub0 = fsub float %A0, %B0 + %A2 = extractelement <16 x float> %A, i32 2 + %B2 = extractelement <16 x float> %B, i32 2 + %sub2 = fsub float %A2, %B2 + %A4 = extractelement <16 x float> %A, i32 4 + %B4 = extractelement <16 x float> %B, i32 4 + %sub4 = fsub float %A4, %B4 + %A6 = extractelement <16 x float> %A, i32 6 + %B6 = extractelement <16 x float> %B, i32 6 + %sub6 = fsub float %A6, %B6 + %A8 = extractelement <16 x float> %A, i32 8 + %B8 = extractelement <16 x float> %B, i32 8 + %sub8 = fsub float %A8, %B8 + %A10 = extractelement <16 x float> %A, i32 10 + %B10 = extractelement <16 x float> %B, i32 10 + %sub10 = fsub float %A10, %B10 + %A12 = extractelement <16 x float> %A, i32 12 + %B12 = extractelement <16 x float> %B, i32 12 + %sub12 = fsub float %A12, %B12 + %A14 = extractelement <16 x float> %A, i32 14 + %B14 = extractelement <16 x float> %B, i32 14 + %sub14 = fsub float %A14, %B14 + %A1 = extractelement <16 x float> %A, i32 1 + %B1 = extractelement <16 x float> %B, i32 1 + %add1 = fadd float %A1, %B1 + %A3 = extractelement <16 x float> %A, i32 3 + %B3 = extractelement <16 x float> %B, i32 3 + %add3 = fadd float %A3, %B3 + %A5 = extractelement <16 x float> %A, i32 5 + %B5 = extractelement <16 x float> %B, i32 5 + %add5 = fadd float %A5, %B5 + %A7 = extractelement <16 x float> %A, i32 7 + %B7 = extractelement <16 x float> %B, i32 7 + %add7 = fadd float %A7, %B7 + %A9 = extractelement <16 x float> %A, i32 9 + %B9 = extractelement <16 x float> %B, i32 9 + %add9 = fadd float %A9, %B9 + %A11 = extractelement <16 x float> %A, i32 11 + %B11 = extractelement <16 x float> %B, i32 11 + %add11 = fadd float %A11, %B11 + %A13 = extractelement <16 x float> %A, i32 13 + %B13 = extractelement <16 x float> %B, i32 13 + %add13 = fadd float %A13, %B13 + %A15 = extractelement <16 x float> %A, i32 15 
+ %B15 = extractelement <16 x float> %B, i32 15 + %add15 = fadd float %A15, %B15 + %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0 + %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1 + %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2 + %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3 + %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4 + ; element 5 is undef + %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6 + %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7 + %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8 + %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9 + %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10 + %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11 + ; element 12 is undef + %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13 + %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14 + %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15 + ret <16 x float> %vecinsert16 +} + +define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 { +; SSE-LABEL: @buildvector_mul_addsub_pd512( +; SSE-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]] +; SSE-NEXT: [[TMP0:%.*]] = fsub <8 x double> [[A]], [[B:%.*]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <6 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <6 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <6 x 
double> [[TMP4]], <6 x double> [[TMP5]], <6 x i32> +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7 +; SSE-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7 +; SSE-NEXT: [[ADD7:%.*]] = fadd double [[A7]], [[B7]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP6]], <6 x double> , <8 x i32> +; SSE-NEXT: [[VECINSERT8:%.*]] = insertelement <8 x double> [[TMP7]], double [[ADD7]], i64 7 +; SSE-NEXT: ret <8 x double> [[VECINSERT8]] +; +; AVX_FMA4-LABEL: @buildvector_mul_addsub_pd512( +; AVX_FMA4-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]] +; AVX_FMA4-NEXT: [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> +; AVX_FMA4-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> +; AVX_FMA4-NEXT: [[TMP2:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]] +; AVX_FMA4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]] +; AVX_FMA4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> +; AVX_FMA4-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7 +; AVX_FMA4-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7 +; AVX_FMA4-NEXT: [[ADD7:%.*]] = fadd double [[A7]], [[B7]] +; AVX_FMA4-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <8 x i32> +; AVX_FMA4-NEXT: [[TMP6:%.*]] = fsub <8 x double> [[A]], [[B]] +; AVX_FMA4-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> +; AVX_FMA4-NEXT: [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> [[TMP7]], <8 x i32> +; AVX_FMA4-NEXT: [[VECINSERT8:%.*]] = insertelement <8 x double> [[VECINSERT71]], double [[ADD7]], i64 7 +; AVX_FMA4-NEXT: ret <8 x double> [[VECINSERT8]] +; +; AVX_FMA3-LABEL: @buildvector_mul_addsub_pd512( +; AVX_FMA3-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]] +; AVX_FMA3-NEXT: [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> +; 
AVX_FMA3-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> +; AVX_FMA3-NEXT: [[TMP2:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]] +; AVX_FMA3-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]] +; AVX_FMA3-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> +; AVX_FMA3-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[A]], [[B]] +; AVX_FMA3-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <8 x i32> +; AVX_FMA3-NEXT: [[TMP7:%.*]] = fsub <8 x double> [[A]], [[B]] +; AVX_FMA3-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> +; AVX_FMA3-NEXT: [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> +; AVX_FMA3-NEXT: [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> +; AVX_FMA3-NEXT: ret <8 x double> [[VECINSERT8]] +; +; AVX512-LABEL: @buildvector_mul_addsub_pd512( +; AVX512-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]] +; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]] +; AVX512-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]] +; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> +; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[A]], [[B]] +; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <8 x i32> +; AVX512-NEXT: [[TMP7:%.*]] = fsub <8 x double> [[A]], [[B]] +; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> +; AVX512-NEXT: [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], 
<8 x double> [[TMP5]], <8 x i32> +; AVX512-NEXT: ret <8 x double> [[VECINSERT8]] +; + %A = fmul <8 x double> %C, %D + %A0 = extractelement <8 x double> %A, i32 0 + %B0 = extractelement <8 x double> %B, i32 0 + %sub0 = fsub double %A0, %B0 + %A2 = extractelement <8 x double> %A, i32 2 + %B2 = extractelement <8 x double> %B, i32 2 + %sub2 = fsub double %A2, %B2 + %A4 = extractelement <8 x double> %A, i32 4 + %B4 = extractelement <8 x double> %B, i32 4 + %sub4 = fsub double %A4, %B4 + %A6 = extractelement <8 x double> %A, i32 6 + %B6 = extractelement <8 x double> %B, i32 6 + %sub6 = fsub double %A6, %B6 + %A1 = extractelement <8 x double> %A, i32 1 + %B1 = extractelement <8 x double> %B, i32 1 + %add1 = fadd double %A1, %B1 + %A3 = extractelement <8 x double> %A, i32 3 + %B3 = extractelement <8 x double> %B, i32 3 + %add3 = fadd double %A3, %B3 + %A7 = extractelement <8 x double> %A, i32 7 + %B7 = extractelement <8 x double> %B, i32 7 + %add7 = fadd double %A7, %B7 + %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0 + %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1 + %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2 + %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3 + %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4 + ; element 5 is undef + %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6 + %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7 + ret <8 x double> %vecinsert8 +} + +define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 { +; CHECK-LABEL: @buildvector_mul_subadd_ps128( +; CHECK-NEXT: [[A:%.*]] = fmul <4 x float> [[C:%.*]], [[D:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = fadd <4 x float> [[A]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A]], [[B]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> 
+; CHECK-NEXT: ret <4 x float> [[TMP2]] +; + %A = fmul <4 x float> %C, %D + %A0 = extractelement <4 x float> %A, i32 0 + %B0 = extractelement <4 x float> %B, i32 0 + %sub0 = fadd float %A0, %B0 + %A2 = extractelement <4 x float> %A, i32 2 + %B2 = extractelement <4 x float> %B, i32 2 + %sub2 = fadd float %A2, %B2 + %A1 = extractelement <4 x float> %A, i32 1 + %B1 = extractelement <4 x float> %B, i32 1 + %add1 = fsub float %A1, %B1 + %A3 = extractelement <4 x float> %A, i32 3 + %B3 = extractelement <4 x float> %B, i32 3 + %add3 = fsub float %A3, %B3 + %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1 + %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2 + %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3 + ret <4 x float> %vecinsert4 +} + +define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 { +; CHECK-LABEL: @buildvector_mul_subadd_pd128( +; CHECK-NEXT: [[A:%.*]] = fmul <2 x double> [[C:%.*]], [[D:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[A]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A]], [[B]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x i32> +; CHECK-NEXT: ret <2 x double> [[TMP2]] +; + %A = fmul <2 x double> %C, %D + %A0 = extractelement <2 x double> %A, i32 0 + %B0 = extractelement <2 x double> %B, i32 0 + %sub0 = fadd double %A0, %B0 + %A1 = extractelement <2 x double> %A, i32 1 + %B1 = extractelement <2 x double> %B, i32 1 + %add1 = fsub double %A1, %B1 + %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0 + %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1 + ret <2 x double> %vecinsert2 +} + +define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 { +; SSE2-LABEL: @buildvector_mul_subadd_ps256( +; SSE2-NEXT: 
[[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] +; SSE2-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A]], [[B:%.*]] +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]] +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> +; SSE2-NEXT: ret <8 x float> [[TMP4]] +; +; SSE4-LABEL: @buildvector_mul_subadd_ps256( +; SSE4-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] +; SSE4-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]] +; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> +; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> +; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> +; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> +; SSE4-NEXT: ret <8 x float> [[TMP6]] +; +; AVX_FMA4-LABEL: @buildvector_mul_subadd_ps256( +; AVX_FMA4-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] +; AVX_FMA4-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]] +; AVX_FMA4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> +; AVX_FMA4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX_FMA4-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> +; AVX_FMA4-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> +; AVX_FMA4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> +; AVX_FMA4-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> +; 
AVX_FMA4-NEXT: ret <8 x float> [[TMP6]] +; +; AVX_FMA3-LABEL: @buildvector_mul_subadd_ps256( +; AVX_FMA3-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] +; AVX_FMA3-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A]], [[B:%.*]] +; AVX_FMA3-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX_FMA3-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> +; AVX_FMA3-NEXT: ret <8 x float> [[TMP2]] +; +; AVX512-LABEL: @buildvector_mul_subadd_ps256( +; AVX512-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] +; AVX512-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A]], [[B:%.*]] +; AVX512-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> +; AVX512-NEXT: ret <8 x float> [[TMP2]] +; + %A = fmul <8 x float> %C, %D + %A0 = extractelement <8 x float> %A, i32 0 + %B0 = extractelement <8 x float> %B, i32 0 + %sub0 = fadd float %A0, %B0 + %A2 = extractelement <8 x float> %A, i32 2 + %B2 = extractelement <8 x float> %B, i32 2 + %sub2 = fadd float %A2, %B2 + %A4 = extractelement <8 x float> %A, i32 4 + %B4 = extractelement <8 x float> %B, i32 4 + %sub4 = fadd float %A4, %B4 + %A6 = extractelement <8 x float> %A, i32 6 + %B6 = extractelement <8 x float> %B, i32 6 + %sub6 = fadd float %A6, %B6 + %A1 = extractelement <8 x float> %A, i32 1 + %B1 = extractelement <8 x float> %B, i32 1 + %add1 = fsub float %A1, %B1 + %A3 = extractelement <8 x float> %A, i32 3 + %B3 = extractelement <8 x float> %B, i32 3 + %add3 = fsub float %A3, %B3 + %A5 = extractelement <8 x float> %A, i32 5 + %B5 = extractelement <8 x float> %B, i32 5 + %add5 = fsub float %A5, %B5 + %A7 = extractelement <8 x float> %A, i32 7 + %B7 = extractelement <8 x float> %B, i32 7 + %add7 = fsub float %A7, %B7 + %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0 + %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1 + %vecinsert3 = insertelement <8 x float> 
%vecinsert2, float %sub2, i32 2 + %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3 + %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4 + %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5 + %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6 + %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7 + ret <8 x float> %vecinsert8 +} + +define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 { +; CHECK-LABEL: @buildvector_mul_subadd_pd256( +; CHECK-NEXT: [[A:%.*]] = fmul <4 x double> [[C:%.*]], [[D:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = fadd <4 x double> [[A]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> [[A]], [[B]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> +; CHECK-NEXT: ret <4 x double> [[TMP2]] +; + %A = fmul <4 x double> %C, %D + %A0 = extractelement <4 x double> %A, i32 0 + %B0 = extractelement <4 x double> %B, i32 0 + %sub0 = fadd double %A0, %B0 + %A2 = extractelement <4 x double> %A, i32 2 + %B2 = extractelement <4 x double> %B, i32 2 + %sub2 = fadd double %A2, %B2 + %A1 = extractelement <4 x double> %A, i32 1 + %B1 = extractelement <4 x double> %B, i32 1 + %add1 = fsub double %A1, %B1 + %A3 = extractelement <4 x double> %A, i32 3 + %B3 = extractelement <4 x double> %B, i32 3 + %add3 = fsub double %A3, %B3 + %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0 + %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1 + %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2 + %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3 + ret <4 x double> %vecinsert4 +} + +define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 { +; SSE-LABEL: @buildvector_mul_subadd_ps512( +; SSE-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], 
[[D:%.*]] +; SSE-NEXT: [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <12 x i32> +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <12 x i32> +; SSE-NEXT: [[TMP2:%.*]] = fadd <12 x float> [[TMP0]], [[TMP1]] +; SSE-NEXT: [[TMP3:%.*]] = fsub <12 x float> [[TMP0]], [[TMP1]] +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <12 x float> [[TMP2]], <12 x float> [[TMP3]], <12 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]] +; SSE-NEXT: [[TMP8:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <12 x float> [[TMP4]], <12 x float> , <16 x i32> +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP8]], <16 x i32> +; SSE-NEXT: [[VECINSERT161:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP10]], <16 x i32> +; SSE-NEXT: ret <16 x float> [[VECINSERT161]] +; +; AVX-LABEL: @buildvector_mul_subadd_ps512( +; AVX-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]] +; AVX-NEXT: [[TMP0:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x float> [[B:%.*]], <16 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[TMP0]], [[TMP1]] +; AVX-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP0]], [[TMP1]] +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP3]], <8 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[A]], <16 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP7:%.*]] = fsub <4 x float> [[TMP5]], [[TMP6]] +; AVX-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP5]], [[TMP6]] +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[A]], <16 x 
float> poison, <2 x i32> +; AVX-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[B]], <16 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP9]], [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = fsub <2 x float> [[TMP9]], [[TMP10]] +; AVX-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> , <16 x i32> +; AVX-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <16 x i32> +; AVX-NEXT: [[VECINSERT141:%.*]] = shufflevector <16 x float> [[TMP13]], <16 x float> [[TMP14]], <16 x i32> +; AVX-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP12]], <16 x i32> +; AVX-NEXT: [[VECINSERT162:%.*]] = shufflevector <16 x float> [[VECINSERT141]], <16 x float> [[TMP15]], <16 x i32> +; AVX-NEXT: ret <16 x float> [[VECINSERT162]] +; + %A = fmul <16 x float> %C, %D + %A0 = extractelement <16 x float> %A, i32 0 + %B0 = extractelement <16 x float> %B, i32 0 + %sub0 = fadd float %A0, %B0 + %A2 = extractelement <16 x float> %A, i32 2 + %B2 = extractelement <16 x float> %B, i32 2 + %sub2 = fadd float %A2, %B2 + %A4 = extractelement <16 x float> %A, i32 4 + %B4 = extractelement <16 x float> %B, i32 4 + %sub4 = fadd float %A4, %B4 + %A6 = extractelement <16 x float> %A, i32 6 + %B6 = extractelement <16 x float> %B, i32 6 + %sub6 = fadd float %A6, %B6 + %A8 = extractelement <16 x float> %A, i32 8 + %B8 = extractelement <16 x float> %B, i32 8 + %sub8 = fadd float %A8, %B8 + %A10 = extractelement <16 x float> %A, i32 10 + %B10 = extractelement <16 x float> %B, i32 10 + %sub10 = fadd float %A10, %B10 + %A12 = extractelement <16 x float> %A, i32 12 + %B12 = extractelement <16 x float> %B, i32 12 + %sub12 = fadd float %A12, %B12 + %A14 = extractelement <16 x float> %A, i32 14 + %B14 = extractelement <16 x float> %B, i32 14 + %sub14 = fadd float %A14, %B14 + %A1 = extractelement <16 x float> %A, i32 1 + %B1 = extractelement <16 x float> %B, i32 1 + %add1 = fsub float %A1, %B1 + %A3 = extractelement <16 x 
float> %A, i32 3 + %B3 = extractelement <16 x float> %B, i32 3 + %add3 = fsub float %A3, %B3 + %A5 = extractelement <16 x float> %A, i32 5 + %B5 = extractelement <16 x float> %B, i32 5 + %add5 = fsub float %A5, %B5 + %A7 = extractelement <16 x float> %A, i32 7 + %B7 = extractelement <16 x float> %B, i32 7 + %add7 = fsub float %A7, %B7 + %A9 = extractelement <16 x float> %A, i32 9 + %B9 = extractelement <16 x float> %B, i32 9 + %add9 = fsub float %A9, %B9 + %A11 = extractelement <16 x float> %A, i32 11 + %B11 = extractelement <16 x float> %B, i32 11 + %add11 = fsub float %A11, %B11 + %A13 = extractelement <16 x float> %A, i32 13 + %B13 = extractelement <16 x float> %B, i32 13 + %add13 = fsub float %A13, %B13 + %A15 = extractelement <16 x float> %A, i32 15 + %B15 = extractelement <16 x float> %B, i32 15 + %add15 = fsub float %A15, %B15 + %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0 + %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1 + %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2 + %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3 + %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4 + ; element 5 is undef + %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6 + %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7 + %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8 + %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9 + %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10 + %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11 + ; element 12 is undef + %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13 + %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14 + %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15 + ret <16 x float> %vecinsert16 +} + 
+define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 { +; SSE-LABEL: @buildvector_mul_subadd_pd512( +; SSE-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]] +; SSE-NEXT: [[TMP0:%.*]] = fadd <8 x double> [[A]], [[B:%.*]] +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[TMP0]], <8 x double> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = fsub <8 x double> [[A]], [[B]] +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <6 x i32> +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <6 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> [[TMP5]], <6 x i32> +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7 +; SSE-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7 +; SSE-NEXT: [[ADD7:%.*]] = fsub double [[A7]], [[B7]] +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP6]], <6 x double> , <8 x i32> +; SSE-NEXT: [[VECINSERT8:%.*]] = insertelement <8 x double> [[TMP7]], double [[ADD7]], i64 7 +; SSE-NEXT: ret <8 x double> [[VECINSERT8]] +; +; AVX_FMA4-LABEL: @buildvector_mul_subadd_pd512( +; AVX_FMA4-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]] +; AVX_FMA4-NEXT: [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> +; AVX_FMA4-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> +; AVX_FMA4-NEXT: [[TMP2:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]] +; AVX_FMA4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]] +; AVX_FMA4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> +; AVX_FMA4-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7 +; AVX_FMA4-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7 +; AVX_FMA4-NEXT: [[ADD7:%.*]] = fsub double [[A7]], 
[[B7]] +; AVX_FMA4-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <8 x i32> +; AVX_FMA4-NEXT: [[TMP6:%.*]] = fadd <8 x double> [[A]], [[B]] +; AVX_FMA4-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> +; AVX_FMA4-NEXT: [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> [[TMP7]], <8 x i32> +; AVX_FMA4-NEXT: [[VECINSERT8:%.*]] = insertelement <8 x double> [[VECINSERT71]], double [[ADD7]], i64 7 +; AVX_FMA4-NEXT: ret <8 x double> [[VECINSERT8]] +; +; AVX_FMA3-LABEL: @buildvector_mul_subadd_pd512( +; AVX_FMA3-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]] +; AVX_FMA3-NEXT: [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> +; AVX_FMA3-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <4 x i32> +; AVX_FMA3-NEXT: [[TMP2:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]] +; AVX_FMA3-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]] +; AVX_FMA3-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> +; AVX_FMA3-NEXT: [[TMP5:%.*]] = fsub <8 x double> [[A]], [[B]] +; AVX_FMA3-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <8 x i32> +; AVX_FMA3-NEXT: [[TMP7:%.*]] = fadd <8 x double> [[A]], [[B]] +; AVX_FMA3-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> +; AVX_FMA3-NEXT: [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> +; AVX_FMA3-NEXT: [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> +; AVX_FMA3-NEXT: ret <8 x double> [[VECINSERT8]] +; +; AVX512-LABEL: @buildvector_mul_subadd_pd512( +; AVX512-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]] +; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <4 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> 
poison, <4 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = fadd <4 x double> [[TMP0]], [[TMP1]] +; AVX512-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP0]], [[TMP1]] +; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> +; AVX512-NEXT: [[TMP5:%.*]] = fsub <8 x double> [[A]], [[B]] +; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <8 x i32> +; AVX512-NEXT: [[TMP7:%.*]] = fadd <8 x double> [[A]], [[B]] +; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP7]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[VECINSERT71:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> [[TMP8]], <8 x i32> +; AVX512-NEXT: [[VECINSERT8:%.*]] = shufflevector <8 x double> [[VECINSERT71]], <8 x double> [[TMP5]], <8 x i32> +; AVX512-NEXT: ret <8 x double> [[VECINSERT8]] +; + %A = fmul <8 x double> %C, %D + %A0 = extractelement <8 x double> %A, i32 0 + %B0 = extractelement <8 x double> %B, i32 0 + %sub0 = fadd double %A0, %B0 + %A2 = extractelement <8 x double> %A, i32 2 + %B2 = extractelement <8 x double> %B, i32 2 + %sub2 = fadd double %A2, %B2 + %A4 = extractelement <8 x double> %A, i32 4 + %B4 = extractelement <8 x double> %B, i32 4 + %sub4 = fadd double %A4, %B4 + %A6 = extractelement <8 x double> %A, i32 6 + %B6 = extractelement <8 x double> %B, i32 6 + %sub6 = fadd double %A6, %B6 + %A1 = extractelement <8 x double> %A, i32 1 + %B1 = extractelement <8 x double> %B, i32 1 + %add1 = fsub double %A1, %B1 + %A3 = extractelement <8 x double> %A, i32 3 + %B3 = extractelement <8 x double> %B, i32 3 + %add3 = fsub double %A3, %B3 + %A7 = extractelement <8 x double> %A, i32 7 + %B7 = extractelement <8 x double> %B, i32 7 + %add7 = fsub double %A7, %B7 + %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0 + %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1 + %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2 + %vecinsert4 = insertelement <8 x 
double> %vecinsert3, double %add3, i32 3 + %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4 + ; element 5 is undef + %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6 + %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7 + ret <8 x double> %vecinsert8 +} + +attributes #0 = { nounwind "unsafe-fp-math"="true" } diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll index 28b48bd3ce6d9..9bfd92ef35a46 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll @@ -272,24 +272,21 @@ define <16 x i16> @add_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) { define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) { ; SSE2-LABEL: @add_v16i16_0123u56789uBCDEF( -; SSE2-NEXT: [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12 -; SSE2-NEXT: [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13 -; SSE2-NEXT: [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14 +; SSE2-NEXT: [[BE:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 14 ; SSE2-NEXT: [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15 -; SSE2-NEXT: [[BCD:%.*]] = add i16 [[BC]], [[BD]] ; SSE2-NEXT: [[BEF:%.*]] = add i16 [[BE]], [[BF]] -; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> -; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <16 x i32> +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <16 x i32> ; SSE2-NEXT: [[HADD8:%.*]] = add <16 x i16> [[TMP3]], [[TMP7]] -; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> 
[[B]], <16 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> ; SSE2-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]] -; SSE2-NEXT: [[HADDD1:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> -; SSE2-NEXT: [[HADDE:%.*]] = insertelement <16 x i16> [[HADDD1]], i16 [[BCD]], i64 14 -; SSE2-NEXT: [[HADDF:%.*]] = insertelement <16 x i16> [[HADDE]], i16 [[BEF]], i64 15 -; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDF]], <16 x i16> poison, <16 x i32> +; SSE2-NEXT: [[HADD92:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> +; SSE2-NEXT: [[HADDB:%.*]] = insertelement <16 x i16> [[HADD92]], i16 [[BEF]], i64 11 +; SSE2-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> +; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> +; SSE2-NEXT: [[TMP9:%.*]] = add <16 x i16> [[TMP10]], [[TMP8]] +; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDB]], <16 x i16> [[TMP9]], <16 x i32> ; SSE2-NEXT: ret <16 x i16> [[RESULT]] ; ; SSE4-LABEL: @add_v16i16_0123u56789uBCDEF( diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll index 0062527b678c9..13b4d7da97c9d 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll @@ -272,24 +272,21 @@ define <16 x i16> @sub_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) { define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) { ; SSE2-LABEL: @sub_v16i16_0123u56789uBCDEF( -; SSE2-NEXT: [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12 -; SSE2-NEXT: [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13 -; SSE2-NEXT: [[BE:%.*]] = 
extractelement <16 x i16> [[B]], i64 14 +; SSE2-NEXT: [[BE:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 14 ; SSE2-NEXT: [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15 -; SSE2-NEXT: [[BCD:%.*]] = sub i16 [[BC]], [[BD]] ; SSE2-NEXT: [[BEF:%.*]] = sub i16 [[BE]], [[BF]] -; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> -; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <16 x i32> +; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <16 x i32> ; SSE2-NEXT: [[HSUB8:%.*]] = sub <16 x i16> [[TMP3]], [[TMP7]] -; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> ; SSE2-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]] -; SSE2-NEXT: [[HSUBD1:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> -; SSE2-NEXT: [[HSUBE:%.*]] = insertelement <16 x i16> [[HSUBD1]], i16 [[BCD]], i64 14 -; SSE2-NEXT: [[HSUBF:%.*]] = insertelement <16 x i16> [[HSUBE]], i16 [[BEF]], i64 15 -; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBF]], <16 x i16> poison, <16 x i32> +; SSE2-NEXT: [[HSUB92:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> +; SSE2-NEXT: [[HSUBB:%.*]] = insertelement <16 x i16> [[HSUB92]], i16 [[BEF]], i64 11 +; SSE2-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> +; SSE2-NEXT: [[TMP8:%.*]] = 
shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> +; SSE2-NEXT: [[TMP9:%.*]] = sub <16 x i16> [[TMP10]], [[TMP8]] +; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBB]], <16 x i16> [[TMP9]], <16 x i32> ; SSE2-NEXT: ret <16 x i16> [[RESULT]] ; ; SSE4-LABEL: @sub_v16i16_0123u56789uBCDEF( diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll index ce9e47a03dee3..f3e89b60b8045 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll @@ -10,124 +10,84 @@ define void @phi_4(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ; GCN-NEXT: [[ENTRY:.*]]: ; GCN-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8 ; GCN-NEXT: [[GEP2:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 2 -; GCN-NEXT: [[GEP3:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 3 ; GCN-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2 ; GCN-NEXT: [[GEP4:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 4 -; GCN-NEXT: [[GEP5:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 5 ; GCN-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8 ; GCN-NEXT: [[GEP6:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 6 -; GCN-NEXT: [[GEP7:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 7 ; GCN-NEXT: [[TMP3:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2 ; GCN-NEXT: [[GEP8:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 8 -; GCN-NEXT: [[GEP9:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 9 ; GCN-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8 ; GCN-NEXT: [[GEP10:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 10 -; GCN-NEXT: [[GEP11:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 11 ; GCN-NEXT: [[TMP5:%.*]] = load 
<2 x i16>, ptr addrspace(3) [[GEP10]], align 2 ; GCN-NEXT: [[GEP12:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 12 -; GCN-NEXT: [[GEP13:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 13 ; GCN-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8 ; GCN-NEXT: [[GEP14:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 14 ; GCN-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2 -; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0 -; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1 -; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 -; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 -; GCN-NEXT: [[TMP12:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 -; GCN-NEXT: [[TMP13:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 -; GCN-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0 -; GCN-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1 -; GCN-NEXT: [[TMP24:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 -; GCN-NEXT: [[TMP26:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 -; GCN-NEXT: [[TMP28:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 -; GCN-NEXT: [[TMP38:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 ; GCN-NEXT: br label %[[DO_BODY:.*]] ; GCN: [[DO_BODY]]: -; GCN-NEXT: [[PHI2:%.*]] = phi i16 [ [[TMP8]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI3:%.*]] = phi i16 [ [[TMP9]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI4:%.*]] = phi i16 [ [[TMP10]], %[[ENTRY]] ], [ [[TMP39:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI5:%.*]] = phi i16 [ [[TMP11]], %[[ENTRY]] ], [ [[OTHERELE5:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI6:%.*]] = phi i16 [ [[TMP12]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI7:%.*]] = phi i16 [ [[TMP13]], %[[ENTRY]] ], [ [[OTHERELE7:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI8:%.*]] = phi i16 [ [[TMP14]], %[[ENTRY]] ], [ 
[[TMP40:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI9:%.*]] = phi i16 [ [[TMP15]], %[[ENTRY]] ], [ [[OTHERELE9:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI10:%.*]] = phi i16 [ [[TMP24]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI11:%.*]] = phi i16 [ [[TMP26]], %[[ENTRY]] ], [ [[OTHERELE11:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI12:%.*]] = phi i16 [ [[TMP28]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI13:%.*]] = phi i16 [ [[TMP38]], %[[ENTRY]] ], [ [[OTHERELE13:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[TMP41:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP17:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP18:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP19:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP20:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP21:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP22:%.*]], %[[DO_BODY]] ] ; GCN-NEXT: [[TMP42:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP23:%.*]], %[[DO_BODY]] ] ; GCN-NEXT: [[TMP16]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8 -; GCN-NEXT: [[OTHERELE3]] = load i16, ptr addrspace(3) [[GEP3]], align 1 -; GCN-NEXT: [[TMP17:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2 -; GCN-NEXT: [[OTHERELE5]] = load i16, ptr addrspace(3) [[GEP5]], align 1 -; GCN-NEXT: [[TMP18:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8 -; GCN-NEXT: [[OTHERELE7]] = load i16, ptr addrspace(3) [[GEP7]], align 1 -; GCN-NEXT: [[TMP19:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2 -; GCN-NEXT: [[OTHERELE9]] = 
load i16, ptr addrspace(3) [[GEP9]], align 1 -; GCN-NEXT: [[TMP20:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8 -; GCN-NEXT: [[OTHERELE11]] = load i16, ptr addrspace(3) [[GEP11]], align 1 -; GCN-NEXT: [[TMP21:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2 -; GCN-NEXT: [[OTHERELE13]] = load i16, ptr addrspace(3) [[GEP13]], align 1 -; GCN-NEXT: [[TMP22:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8 +; GCN-NEXT: [[TMP17]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[TMP18]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8 +; GCN-NEXT: [[TMP19]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2 +; GCN-NEXT: [[TMP20]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8 +; GCN-NEXT: [[TMP21]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2 +; GCN-NEXT: [[TMP22]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8 ; GCN-NEXT: [[TMP23]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2 ; GCN-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 -; GCN-NEXT: [[TMP30]] = extractelement <2 x i16> [[TMP17]], i32 0 -; GCN-NEXT: [[TMP39]] = extractelement <2 x i16> [[TMP18]], i32 0 -; GCN-NEXT: [[TMP32]] = extractelement <2 x i16> [[TMP19]], i32 0 -; GCN-NEXT: [[TMP40]] = extractelement <2 x i16> [[TMP20]], i32 0 -; GCN-NEXT: [[TMP34]] = extractelement <2 x i16> [[TMP21]], i32 0 -; GCN-NEXT: [[TMP35]] = extractelement <2 x i16> [[TMP22]], i32 0 ; GCN-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] ; GCN: [[EXIT]]: -; GCN-NEXT: [[TMP36:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP17]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC038:%.*]] = shufflevector <16 x i16> [[TMP36]], <16 x i16> [[TMP37]], <16 x i32> +; GCN-NEXT: [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> [[TMP17]], <16 x i32> ; GCN-NEXT: [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: 
[[VEC059:%.*]] = shufflevector <16 x i16> [[VEC038]], <16 x i16> [[TMP25]], <16 x i32> +; GCN-NEXT: [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP24]], <16 x i16> [[TMP25]], <16 x i32> ; GCN-NEXT: [[TMP27:%.*]] = shufflevector <2 x i16> [[TMP19]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC0710:%.*]] = shufflevector <16 x i16> [[VEC059]], <16 x i16> [[TMP27]], <16 x i32> +; GCN-NEXT: [[TMP28:%.*]] = shufflevector <16 x i16> [[TMP26]], <16 x i16> [[TMP27]], <16 x i32> ; GCN-NEXT: [[TMP29:%.*]] = shufflevector <2 x i16> [[TMP20]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC0911:%.*]] = shufflevector <16 x i16> [[VEC0710]], <16 x i16> [[TMP29]], <16 x i32> +; GCN-NEXT: [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP28]], <16 x i16> [[TMP29]], <16 x i32> ; GCN-NEXT: [[TMP31:%.*]] = shufflevector <2 x i16> [[TMP21]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC01112:%.*]] = shufflevector <16 x i16> [[VEC0911]], <16 x i16> [[TMP31]], <16 x i32> +; GCN-NEXT: [[TMP32:%.*]] = shufflevector <16 x i16> [[TMP30]], <16 x i16> [[TMP31]], <16 x i32> ; GCN-NEXT: [[TMP33:%.*]] = shufflevector <2 x i16> [[TMP22]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[TMP58:%.*]] = shufflevector <16 x i16> [[VEC01112]], <16 x i16> [[TMP33]], <16 x i32> -; GCN-NEXT: [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP47:%.*]] = shufflevector <16 x i16> [[TMP32]], <16 x i16> [[TMP33]], <16 x i32> +; GCN-NEXT: [[TMP48:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP49:%.*]] = shufflevector <16 x i16> [[TMP47]], <16 x i16> [[TMP48]], <16 x i32> +; GCN-NEXT: [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <16 x i32> +; GCN-NEXT: [[TMP38:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP37]], <16 x i16> [[TMP38]], <16 x i32> +; GCN-NEXT: [[TMP40:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> 
poison, <16 x i32> +; GCN-NEXT: [[TMP41:%.*]] = shufflevector <16 x i16> [[TMP39]], <16 x i16> [[TMP40]], <16 x i32> +; GCN-NEXT: [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP43:%.*]] = shufflevector <16 x i16> [[TMP41]], <16 x i16> [[TMP57]], <16 x i32> +; GCN-NEXT: [[TMP44:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[TMP43]], <16 x i16> [[TMP44]], <16 x i32> +; GCN-NEXT: [[TMP46:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP58:%.*]] = shufflevector <16 x i16> [[TMP45]], <16 x i16> [[TMP46]], <16 x i32> +; GCN-NEXT: [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC2157:%.*]] = shufflevector <16 x i16> [[TMP58]], <16 x i16> [[TMP60]], <16 x i32> -; GCN-NEXT: [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC231:%.*]] = shufflevector <16 x i16> [[TMP50]], <16 x i16> [[TMP51]], <16 x i32> -; GCN-NEXT: [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC252:%.*]] = shufflevector <16 x i16> [[VEC231]], <16 x i16> [[TMP52]], <16 x i32> -; GCN-NEXT: [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC273:%.*]] = shufflevector <16 x i16> [[VEC252]], <16 x i16> [[TMP53]], <16 x i32> -; GCN-NEXT: [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x 
i32> +; GCN-NEXT: [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC294:%.*]] = shufflevector <16 x i16> [[VEC273]], <16 x i16> [[TMP54]], <16 x i32> -; GCN-NEXT: [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC2115:%.*]] = shufflevector <16 x i16> [[VEC294]], <16 x i16> [[TMP55]], <16 x i32> -; GCN-NEXT: [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC2136:%.*]] = shufflevector <16 x i16> [[VEC2115]], <16 x i16> [[TMP56]], <16 x i32> -; GCN-NEXT: [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC2151:%.*]] = shufflevector <16 x i16> [[VEC2136]], <16 x i16> [[TMP59]], <16 x i32> -; GCN-NEXT: [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP41]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC22:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[PHI2]], i64 2 -; GCN-NEXT: [[VEC23:%.*]] = insertelement <16 x i16> [[VEC22]], i16 [[PHI3]], i64 3 -; GCN-NEXT: [[VEC24:%.*]] = insertelement <16 x i16> [[VEC23]], i16 [[PHI4]], i64 4 -; GCN-NEXT: [[VEC25:%.*]] = insertelement <16 x i16> [[VEC24]], i16 [[PHI5]], i64 5 -; GCN-NEXT: [[VEC26:%.*]] = insertelement <16 x i16> [[VEC25]], i16 [[PHI6]], i64 6 -; GCN-NEXT: [[VEC27:%.*]] = insertelement <16 x i16> [[VEC26]], i16 [[PHI7]], i64 7 -; GCN-NEXT: [[VEC28:%.*]] = insertelement <16 x i16> [[VEC27]], i16 [[PHI8]], i64 8 -; GCN-NEXT: [[VEC29:%.*]] = insertelement <16 x i16> [[VEC28]], i16 [[PHI9]], i64 9 -; GCN-NEXT: [[VEC210:%.*]] = insertelement <16 x i16> [[VEC29]], i16 [[PHI10]], i64 10 -; GCN-NEXT: [[VEC211:%.*]] = insertelement <16 x i16> [[VEC210]], i16 [[PHI11]], 
i64 11 -; GCN-NEXT: [[VEC212:%.*]] = insertelement <16 x i16> [[VEC211]], i16 [[PHI12]], i64 12 -; GCN-NEXT: [[VEC213:%.*]] = insertelement <16 x i16> [[VEC212]], i16 [[PHI13]], i64 13 -; GCN-NEXT: [[TMP61:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC2152:%.*]] = shufflevector <16 x i16> [[VEC213]], <16 x i16> [[TMP61]], <16 x i32> -; GCN-NEXT: store <16 x i16> [[VEC2151]], ptr [[OUT]], align 32 -; GCN-NEXT: store <16 x i16> [[VEC2157]], ptr [[OUT1]], align 32 -; GCN-NEXT: store <16 x i16> [[VEC2152]], ptr [[OUT2]], align 32 +; GCN-NEXT: store <16 x i16> [[VEC2157]], ptr [[OUT]], align 32 +; GCN-NEXT: store <16 x i16> [[TMP49]], ptr [[OUT1]], align 32 +; GCN-NEXT: store <16 x i16> [[VEC2151]], ptr [[OUT2]], align 32 ; GCN-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll index c79fa9c84d1c3..f71fdbdee527b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll @@ -271,7 +271,9 @@ bb: } ; GCN-LABEL: @copysign_combine_v2f16 -; GCN: call <2 x half> @llvm.copysign.v2f16( +; GFX8: call half @llvm.copysign.f16( +; GFX8: call half @llvm.copysign.f16( +; GFX9: call <2 x half> @llvm.copysign.v2f16( define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -290,8 +292,6 @@ bb: ; FIXME: Should always vectorize ; GCN-LABEL: @copysign_combine_v4f16 -; GCN: call <2 x half> @llvm.copysign.v2f16( - ; GFX8: call half @llvm.copysign.f16( ; GFX8: call half @llvm.copysign.f16( @@ -327,8 +327,10 @@ bb: } ; GCN-LABEL: @canonicalize_combine_v4f16 -; GCN: call <2 x half> @llvm.canonicalize.v2f16( -; GCN: call <2 x half> @llvm.canonicalize.v2f16( +; GFX8: call half @llvm.canonicalize.f16( +; GFX8: call half @llvm.canonicalize.f16( + +; GFX9: call <2 x half> @llvm.canonicalize.v2f16( define void 
@canonicalize_combine_v4f16(ptr addrspace(1) %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -358,3 +360,43 @@ bb: store half %tmp16, ptr addrspace(1) %tmp14, align 2 ret void } + +; FIXME: Should not vectorize on gfx8 +; GCN-LABEL: @minimumnum_combine_v2f16 +; GFX8: call <2 x half> @llvm.minimumnum.v2f16 +; GFX9: call <2 x half> @llvm.minimumnum.v2f16 +define void @minimumnum_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.minimumnum.f16(half %tmp3, half 1.000000e+00) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half @llvm.minimumnum.f16(half %tmp7, half 1.000000e+00) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} + +; FIXME: Should not vectorize on gfx8 +; GCN-LABEL: @maximumnum_combine_v2f16 +; GFX8: call <2 x half> @llvm.maximumnum.v2f16 +; GFX9: call <2 x half> @llvm.maximumnum.v2f16 +define void @maximumnum_combine_v2f16(ptr addrspace(1) %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1 + %tmp3 = load half, ptr addrspace(1) %tmp2, align 2 + %tmp4 = call half @llvm.maximumnum.f16(half %tmp3, half 1.000000e+00) + store half %tmp4, ptr addrspace(1) %tmp2, align 2 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5 + %tmp7 = load half, ptr addrspace(1) %tmp6, align 2 + %tmp8 = call half @llvm.maximumnum.f16(half %tmp7, half 1.000000e+00) + store half %tmp8, ptr addrspace(1) %tmp6, align 2 + ret void +} diff --git 
a/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll new file mode 100644 index 0000000000000..6006bf9cb262d --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-ibm-linux -mcpu=z13 -slp-max-reg-size=256 -slp-vectorize-hor-store -slp-vectorize-non-power-of-2 < %s | FileCheck %s + +@c = external global [1 x [10 x i32]] +@j.0 = external global i32 + +define void @p() { +; CHECK-LABEL: define void @p( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[TMP0]], splat (i32 1) +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <7 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <7 x i32> [[TMP4]], i32 3 +; CHECK-NEXT: [[OR_1_5_I_3:%.*]] = or i32 [[TMP1]], [[TMP5]] +; CHECK-NEXT: store i32 [[OR_1_5_I_3]], ptr @j.0, align 4 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <7 x i32> [[TMP4]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = xor <7 x i32> [[TMP4]], splat (i32 1) +; CHECK-NEXT: store <7 x i32> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP2]] +; CHECK-NEXT: 
[[OR_1_5_I_5:%.*]] = or i32 [[TMP10]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; CHECK-NEXT: [[OR_1_6_I_5:%.*]] = or i32 [[OR_1_5_I_5]], [[TMP11]] +; CHECK-NEXT: store i32 [[OR_1_6_I_5]], ptr @j.0, align 4 +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP8]], splat (i32 1) +; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4 +; CHECK-NEXT: ret void +; +entry: + %arrayidx12.promoted.5.i = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4 + %conv14.5.i = xor i32 %arrayidx12.promoted.5.i, 1 + store i32 %conv14.5.i, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4 + %arrayidx12.promoted.5.i.1 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 204), align 4 + %conv14.5.i.1 = xor i32 %arrayidx12.promoted.5.i.1, 1 + store i32 %conv14.5.i.1, ptr getelementptr inbounds nuw (i8, ptr @c, i64 204), align 4 + %arrayidx12.promoted.5.i.2 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 208), align 4 + %conv14.5.i.2 = xor i32 %arrayidx12.promoted.5.i.2, 1 + store i32 %conv14.5.i.2, ptr getelementptr inbounds nuw (i8, ptr @c, i64 208), align 4 + %arrayidx12.promoted.1.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4 + %conv14.1.i.3 = xor i32 %arrayidx12.promoted.1.i.3, 1 + store i32 %conv14.1.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4 + %arrayidx12.promoted.5.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 212), align 4 + %conv14.5.i.3 = xor i32 %arrayidx12.promoted.5.i.3, 1 + store i32 %conv14.5.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 212), align 4 + %arrayidx12.promoted.6.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4 + %conv14.6.i.3 = xor i32 %arrayidx12.promoted.6.i.3, 1 + %or.1.5.i.3 = or i32 %arrayidx12.promoted.1.i.3, %arrayidx12.promoted.5.i.3 + store i32 %conv14.6.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 
252), align 4 + store i32 %or.1.5.i.3, ptr @j.0, align 4 + %arrayidx12.promoted.1.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 56), align 4 + %conv14.1.i.4 = xor i32 %arrayidx12.promoted.1.i.4, 1 + store i32 %conv14.1.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 56), align 4 + %arrayidx12.promoted.5.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 216), align 4 + %conv14.5.i.4 = xor i32 %arrayidx12.promoted.5.i.4, 1 + store i32 %conv14.5.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 216), align 4 + %arrayidx12.promoted.6.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 256), align 4 + %conv14.6.i.4 = xor i32 %arrayidx12.promoted.6.i.4, 1 + store i32 %conv14.6.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 256), align 4 + %arrayidx12.promoted.1.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 60), align 4 + %conv14.1.i.5 = xor i32 %arrayidx12.promoted.1.i.5, 1 + store i32 %conv14.1.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 60), align 4 + %arrayidx12.promoted.5.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 220), align 4 + %conv14.5.i.5 = xor i32 %arrayidx12.promoted.5.i.5, 1 + store i32 %conv14.5.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 220), align 4 + %arrayidx12.promoted.6.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 260), align 4 + %conv14.6.i.5 = xor i32 %arrayidx12.promoted.6.i.5, 1 + %0 = or i32 %arrayidx12.promoted.6.i.4, %arrayidx12.promoted.1.i.5 + %or.1.5.i.5 = or i32 %0, %arrayidx12.promoted.5.i.5 + %or.1.6.i.5 = or i32 %or.1.5.i.5, %arrayidx12.promoted.6.i.5 + store i32 %conv14.6.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 260), align 4 + store i32 %or.1.6.i.5, ptr @j.0, align 4 + %arrayidx12.promoted.1.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 64), align 4 + %conv14.1.i.6 = xor i32 %arrayidx12.promoted.1.i.6, 1 + store i32 %conv14.1.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, 
i64 64), align 4 + %arrayidx12.promoted.5.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 224), align 4 + %conv14.5.i.6 = xor i32 %arrayidx12.promoted.5.i.6, 1 + store i32 %conv14.5.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 224), align 4 + %arrayidx12.promoted.6.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 264), align 4 + %conv14.6.i.6 = xor i32 %arrayidx12.promoted.6.i.6, 1 + store i32 %conv14.6.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 264), align 4 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll index 28bab3276c47d..6942df532ae29 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll @@ -7,9 +7,8 @@ define void @foo(double %i) { ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> , double [[I]], i32 2 ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]] ; CHECK-NEXT: [[I82:%.*]] = fsub double 0.000000e+00, poison +; CHECK-NEXT: [[I103:%.*]] = fsub double 0.000000e+00, [[I]] ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> , <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[I82]], i32 2 @@ -22,13 +21,11 @@ define void @foo(double %i) { ; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP16]]) ; CHECK-NEXT: br i1 [[TMP17]], label [[BB58:%.*]], label [[BB115:%.*]] ; CHECK: bb115: -; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: 
[[TMP19:%.*]] = extractelement <2 x double> [[TMP18]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = fmul double 0.000000e+00, [[I103]] +; CHECK-NEXT: [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]] ; CHECK-NEXT: [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]] ; CHECK-NEXT: [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP22]], <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x double> , double [[I82]], i32 3 ; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll index e3a6020a542fb..2cc2f28ccf6d5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -25,7 +25,6 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -34,7 +33,6 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[T49:%.*]] = add nsw i32 
[[T40]], [[T47]] ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 ; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 @@ -42,17 +40,20 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 ; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> 
[[TMP11]], <4 x i32> +; CHECK-NEXT: [[T701:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 -; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], splat (i32 3) +; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], splat (i32 3) ; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/isCommutative.ll b/llvm/test/Transforms/SLPVectorizer/isCommutative.ll new file mode 100644 index 0000000000000..704ac8295f55b --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/isCommutative.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s + +define i16 @check_isCommutative_with_the_original_source() { +; CHECK-LABEL: @check_isCommutative_with_the_original_source( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND3:%.*]] = select i1 true, i16 1, i16 0 +; CHECK-NEXT: ret i16 [[COND3]] +; +entry: + %sub = sub i16 0, -1 + %cmp = icmp eq i16 %sub, 1 + + %sub1 = sub i16 0, -1 + %cmp2 = icmp eq i16 %sub1, 1 + %cond3 = select i1 %cmp2, i16 1, i16 0 + + %sub5 = sub nsw i16 0, 0 + %cmp6 = icmp eq i16 %sub5, 0 + %cmp9 = icmp eq i16 %sub5, 0 + + %sub12 = sub nsw i16 0, 0 + %cmp13 = icmp eq i16 %sub12, 0 + + %sub16 = sub nsw i16 0, 0 + %cmp17 = icmp eq i16 %sub16, 0 + + %sub20 = sub nsw i16 0, 0 + %cmp21 = icmp eq i16 %sub20, 0 + %cmp24 = icmp eq i16 %sub20, 0 + + ret i16 %cond3 +} + diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll index a1f660d31668e..5459512239b64 100644 --- a/llvm/test/Transforms/Util/add-TLI-mappings.ll +++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll @@ -1,15 +1,13 @@ ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=SVML -passes=inject-tli-mappings -S < %s 
| FileCheck %s --check-prefixes=COMMON,SVML ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=AMDLIBM -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,AMDLIBM ; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -vector-library=MASSV -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,MASSV -; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=LIBMVEC-AARCH64 +; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,LIBMVEC-AARCH64 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,LIBMVEC-X86 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=Accelerate -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,ACCELERATE ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=sleefgnuabi -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,SLEEFGNUABI ; RUN: opt -mtriple=riscv64-unknown-linux-gnu -vector-library=sleefgnuabi -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,SLEEFGNUABI_RISCV ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=ArmPL -passes=inject-tli-mappings -S < %s | FileCheck %s --check-prefixes=COMMON,ARMPL -; LIBMVEC-AARCH64-NOT: llvm.compiler.used - ; COMMON-LABEL: @llvm.compiler.used = appending global ; SVML-SAME: [6 x ptr] [ ; SVML-SAME: ptr @__svml_sin2, @@ -35,6 +33,12 @@ ; MASSV-SAME: ptr @__log10f4 ; ACCELERATE-SAME: [1 x ptr] [ ; ACCELERATE-SAME: ptr @vlog10f +; LIBMVEC-AARCH64-SAME: [5 x ptr] [ +; LIBMVEC-AARCH64-SAME: ptr @_ZGVnN2v_sin, +; LIBMVEC-AARCH64-SAME: ptr @_ZGVsMxv_sin, +; LIBMVEC-AARCH64-SAME: ptr @_ZGVnN2v_log10f, +; LIBMVEC-AARCH64-SAME: ptr @_ZGVnN4v_log10f, +; LIBMVEC-AARCH64-SAME: ptr @_ZGVsMxv_log10f ; 
LIBMVEC-X86-SAME: [2 x ptr] [ ; LIBMVEC-X86-SAME: ptr @_ZGVbN2v_sin, ; LIBMVEC-X86-SAME: ptr @_ZGVdN4v_sin @@ -100,6 +104,7 @@ define double @sin_f64(double %in) { ; AMDLIBM: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] ; MASSV: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] ; ACCELERATE: call double @sin(double %{{.*}}) +; LIBMVEC-AARCH64: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] ; LIBMVEC-X86: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] ; SLEEFGNUABI: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] ; SLEEFGNUABI_RISCV: call double @sin(double %{{.*}}) #[[SIN:[0-9]+]] @@ -158,6 +163,7 @@ define float @call_llvm.log10.f32(float %in) { ; COMMON-LABEL: @call_llvm.log10.f32( ; SVML: call float @llvm.log10.f32(float %{{.*}}) ; AMDLIBM: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]] +; LIBMVEC-AARCH64: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]] ; LIBMVEC-X86: call float @llvm.log10.f32(float %{{.*}}) ; MASSV: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]] ; ACCELERATE: call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]] @@ -167,6 +173,7 @@ define float @call_llvm.log10.f32(float %in) { ; No mapping of "llvm.log10.f32" to a vector function for SVML. 
; SVML-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}}) ; AMDLIBM-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}}) +; LIBMVEC-AARCH64-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}}) ; LIBMVEC-X86-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}}) %call = tail call float @llvm.log10.f32(float %in) ret float %call @@ -196,8 +203,11 @@ declare float @llvm.log10.f32(float) #0 ; MASSV: declare <2 x double> @__sind2(<2 x double>) ; MASSV: declare <4 x float> @__log10f4(<4 x float>) -; LIBMVEC-AARCH64-NOT: declare <2 x double> @_ZGVbN2v_sin(<2 x double>) -; LIBMVEC-AARCH64-NOT: declare <4 x double> @_ZGVdN4v_sin(<4 x double>) +; LIBMVEC-AARCH64: declare aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double>) +; LIBMVEC-AARCH64: declare @_ZGVsMxv_sin(, ) +; LIBMVEC-AARCH64: declare aarch64_vector_pcs <2 x float> @_ZGVnN2v_log10f(<2 x float>) +; LIBMVEC-AARCH64: declare aarch64_vector_pcs <4 x float> @_ZGVnN4v_log10f(<4 x float>) +; LIBMVEC-AARCH64: declare @_ZGVsMxv_log10f(, ) ; LIBMVEC-X86: declare <2 x double> @_ZGVbN2v_sin(<2 x double>) ; LIBMVEC-X86: declare <4 x double> @_ZGVdN4v_sin(<4 x double>) @@ -272,6 +282,14 @@ attributes #0 = { nounwind readnone } ; ACCELERATE: attributes #[[LOG10]] = { "vector-function-abi-variant"= ; ACCELERATE-SAME: "_ZGV_LLVM_N4v_llvm.log10.f32(vlog10f)" } +; LIBMVEC-AARCH64: attributes #[[SIN]] = { "vector-function-abi-variant"= +; LIBMVEC-AARCH64-SAME: "_ZGV_LLVM_N2v_sin(_ZGVnN2v_sin), +; LIBMVEC-AARCH64-SAME: _ZGVsMxv_sin(_ZGVsMxv_sin)" } +; LIBMVEC-AARCH64: attributes #[[LOG10]] = { "vector-function-abi-variant"= +; LIBMVEC-AARCH64-SAME: "_ZGV_LLVM_N2v_llvm.log10.f32(_ZGVnN2v_log10f), +; LIBMVEC-AARCH64-SAME: _ZGV_LLVM_N4v_llvm.log10.f32(_ZGVnN4v_log10f), +; LIBMVEC-AARCH64-SAME: _ZGVsMxv_llvm.log10.f32(_ZGVsMxv_log10f)" } + ; LIBMVEC-X86: attributes #[[SIN]] = { "vector-function-abi-variant"= ; LIBMVEC-X86-SAME: "_ZGV_LLVM_N2v_sin(_ZGVbN2v_sin), ; LIBMVEC-X86-SAME: _ZGV_LLVM_N4v_sin(_ZGVdN4v_sin)" } diff --git 
a/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll b/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll index a454bf14c3353..77de0241daeab 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 %s -o - -filetype=obj \ +; RUN: llc -O0 %s -o - -filetype=obj -debugger-tune=gdb -accel-tables=Apple \ ; RUN: | llvm-dwarfdump -statistics - | FileCheck %s ; CHECK: "version": 9, @@ -55,7 +55,7 @@ ; CHECK: "#bytes within functions": [[FUNCSIZE:[0-9]+]] ; CHECK: "#bytes within inlined functions": [[INLINESIZE:[0-9]+]] ; CHECK: "#bytes in __debug_loc": 35, -; CHECK-NEXT: "#bytes in __debug_abbrev": 384, +; CHECK-NEXT: "#bytes in __debug_abbrev": 375, ; CHECK-NEXT: "#bytes in __debug_info": 459, ; CHECK-NEXT: "#bytes in __debug_str": 231, ; CHECK-NEXT: "#bytes in __apple_names": 348, diff --git a/llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test b/llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test new file mode 100644 index 0000000000000..fcf3b8f5463d4 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test @@ -0,0 +1,3 @@ +# RUN: llvm-exegesis -mtriple=riscv64-unknown-linux-gnu -mcpu=generic --benchmark-phase=assemble-measured-code -mode=inverse_throughput -opcode-name=InsnB 2>&1 | FileCheck %s + +CHECK: Unsupported opcode: No Sched Class diff --git a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s index d90dce8c5c3fc..f6dc6eef3f0ff 100644 --- a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s +++ b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py -# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+b,+zbc -timeline -iterations=1 < %s | FileCheck %s +# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+zbc -timeline -iterations=1 < %s | FileCheck %s # Two ALUs without dependency can be dispatched in the 
same cycle. add a0, a0, a0 diff --git a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test index 2b2bd670613de..d6f6fe10d88c2 100644 --- a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test +++ b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test @@ -16,7 +16,11 @@ # CHECK: fileoff: 0 # The YAML below is the following code +# ``` +# static int foo = 12345; +# int bar = 4567; # int main(int argc, char **argv) { return 0; } +# ``` # Compiled on macOS against the macOS SDK and passing `-Wl,-encryptable` # Contents are removed, since they are not important for the test. We need a # small text segment (smaller than a page). @@ -26,8 +30,8 @@ FileHeader: cputype: 0x100000C cpusubtype: 0x0 filetype: 0x2 - ncmds: 15 - sizeofcmds: 696 + ncmds: 18 + sizeofcmds: 920 flags: 0x200085 reserved: 0x0 LoadCommands: @@ -69,7 +73,7 @@ LoadCommands: - sectname: __unwind_info segname: __TEXT addr: 0x100004020 - size: 4152 + size: 88 offset: 0x4020 align: 2 reloff: 0x0 @@ -79,37 +83,61 @@ LoadCommands: reserved2: 0x0 reserved3: 0x0 - cmd: LC_SEGMENT_64 - cmdsize: 72 - segname: __LINKEDIT + cmdsize: 152 + segname: __DATA vmaddr: 4295000064 - vmsize: 592 + vmsize: 16384 fileoff: 32768 - filesize: 592 + filesize: 16384 + maxprot: 3 + initprot: 3 + nsects: 1 + flags: 0 + Sections: + - sectname: __data + segname: __DATA + addr: 0x100008000 + size: 4 + offset: 0x8000 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4295016448 + vmsize: 16384 + fileoff: 49152 + filesize: 768 maxprot: 1 initprot: 1 nsects: 0 flags: 0 - cmd: LC_DYLD_CHAINED_FIXUPS cmdsize: 16 - dataoff: 32768 - datasize: 48 + dataoff: 49152 + datasize: 56 - cmd: LC_DYLD_EXPORTS_TRIE cmdsize: 16 - dataoff: 32816 - datasize: 48 + dataoff: 49208 + datasize: 64 - cmd: LC_SYMTAB cmdsize: 24 - symoff: 
32872 - nsyms: 2 - stroff: 32904 - strsize: 32 + symoff: 49280 + nsyms: 3 + stroff: 49328 + strsize: 40 - cmd: LC_DYSYMTAB cmdsize: 80 ilocalsym: 0 nlocalsym: 0 iextdefsym: 0 - nextdefsym: 2 - iundefsym: 2 + nextdefsym: 3 + iundefsym: 3 nundefsym: 0 tocoff: 0 ntoc: 0 @@ -123,12 +151,6 @@ LoadCommands: nextrel: 0 locreloff: 0 nlocrel: 0 - - cmd: LC_ENCRYPTION_INFO_64 - cmdsize: 24 - cryptoff: 16384 - cryptsize: 16384 - cryptid: 0 - pad: 0 - cmd: LC_LOAD_DYLINKER cmdsize: 32 name: 12 @@ -136,32 +158,50 @@ LoadCommands: ZeroPadBytes: 7 - cmd: LC_UUID cmdsize: 24 - uuid: 4C4C4447-5555-3144-A18A-01E9EB7E7D92 + uuid: ADDA943C-657A-3A49-9580-168E17A40FFB - cmd: LC_BUILD_VERSION cmdsize: 32 platform: 1 minos: 983040 - sdk: 983552 + sdk: 984320 ntools: 1 Tools: - - tool: 4 - version: 1310720 + - tool: 3 + version: 76481537 + - cmd: LC_SOURCE_VERSION + cmdsize: 16 + version: 0 - cmd: LC_MAIN cmdsize: 24 entryoff: 16384 stacksize: 0 + - cmd: LC_ENCRYPTION_INFO_64 + cmdsize: 24 + cryptoff: 16384 + cryptsize: 16384 + cryptid: 0 + pad: 0 + - cmd: LC_LOAD_DYLIB + cmdsize: 56 + dylib: + name: 24 + timestamp: 2 + current_version: 88539136 + compatibility_version: 65536 + Content: '/usr/lib/libSystem.B.dylib' + ZeroPadBytes: 6 - cmd: LC_FUNCTION_STARTS cmdsize: 16 - dataoff: 32864 + dataoff: 49272 datasize: 8 - cmd: LC_DATA_IN_CODE cmdsize: 16 - dataoff: 32872 + dataoff: 49280 datasize: 0 - cmd: LC_CODE_SIGNATURE cmdsize: 16 - dataoff: 32944 - datasize: 416 + dataoff: 49376 + datasize: 544 LinkEditData: ExportTrie: TerminalSize: 0 @@ -173,51 +213,67 @@ LinkEditData: ImportName: '' Children: - TerminalSize: 0 - NodeOffset: 5 + NodeOffset: 25 Name: _ Flags: 0x0 Address: 0x0 Other: 0x0 ImportName: '' Children: + - TerminalSize: 2 + NodeOffset: 9 + Name: _mh_execute_header + Flags: 0x0 + Address: 0x0 + Other: 0x0 + ImportName: '' - TerminalSize: 4 - NodeOffset: 33 - Name: main + NodeOffset: 13 + Name: bar Flags: 0x0 - Address: 0x4000 + Address: 0x8000 Other: 0x0 ImportName: '' - - 
TerminalSize: 2 - NodeOffset: 39 - Name: _mh_execute_header + - TerminalSize: 4 + NodeOffset: 19 + Name: main Flags: 0x0 - Address: 0x0 + Address: 0x4000 Other: 0x0 ImportName: '' NameList: - n_strx: 2 n_type: 0xF n_sect: 1 + n_desc: 16 + n_value: 4294967296 + - n_strx: 22 + n_type: 0xF + n_sect: 3 n_desc: 0 - n_value: 4294983680 - - n_strx: 8 + n_value: 4295000064 + - n_strx: 27 n_type: 0xF n_sect: 1 - n_desc: 16 - n_value: 4294967296 + n_desc: 0 + n_value: 4294983680 StringTable: - ' ' - - _main - __mh_execute_header + - _bar + - _main + - '' + - '' + - '' - '' - '' - '' - '' FunctionStarts: [ 0x4000 ] - ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x30, 0x0, - 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x34, 0x0, + 0x0, 0x0, 0x34, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] ... 
- diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp index 68d19514bedb2..fc5f82f288ae4 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Target.cpp @@ -45,6 +45,8 @@ ExegesisTarget::getIgnoredOpcodeReasonOrNull(const LLVMState &State, return "Unsupported opcode: isBranch/isIndirectBranch"; if (InstrDesc.isCall() || InstrDesc.isReturn()) return "Unsupported opcode: isCall/isReturn"; + if (InstrDesc.getSchedClass() == 0) + return "Unsupported opcode: No Sched Class"; return nullptr; } diff --git a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h index 94a44e3afccb4..85c4165de4aa9 100644 --- a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h +++ b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h @@ -12,6 +12,7 @@ #include "llvm-readobj.h" #include "llvm/ADT/STLExtras.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h" #include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h" @@ -228,8 +229,8 @@ void PrinterContext::printEHFrame(const Elf_Shdr *EHFrameShdr) const { W.indent(); auto DumpOpts = DIDumpOptions(); DumpOpts.IsEH = true; - Entry.cfis().dump(W.getOStream(), DumpOpts, W.getIndentLevel(), - InitialLocation); + printCFIProgram(Entry.cfis(), W.getOStream(), DumpOpts, W.getIndentLevel(), + InitialLocation); W.unindent(); W.unindent(); W.getOStream() << "\n"; diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index 1b590aa33bd86..2162588aadfdb 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -388,6 +388,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDValue SExt = DAG->getNode(ISD::SIGN_EXTEND, DL, Int64VT, Op0); SDValue Trunc = DAG->getNode(ISD::TRUNCATE, 
DL, Int32VT, Op1); + SDValue Abs = DAG->getNode(ISD::ABS, DL, Int32VT, Op0); + SDValue Sub = DAG->getNode(ISD::SUB, DL, Int32VT, Trunc, Op0); SDValue Neg = DAG->getNegative(Op0, DL, Int32VT); SDValue Not = DAG->getNOT(DL, Op0, Int32VT); @@ -417,6 +419,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { EXPECT_FALSE(sd_match(ZExt, m_SExtLike(m_Value()))); EXPECT_TRUE(sd_match(Trunc, m_Trunc(m_Specific(Op1)))); + EXPECT_TRUE(sd_match(Abs, m_Abs(m_Specific(Op0)))); + EXPECT_TRUE(sd_match(Neg, m_Neg(m_Value()))); EXPECT_TRUE(sd_match(Not, m_Not(m_Value()))); EXPECT_FALSE(sd_match(ZExt, m_Neg(m_Value()))); diff --git a/llvm/unittests/Frontend/CMakeLists.txt b/llvm/unittests/Frontend/CMakeLists.txt index 2119642769e3d..4048143b36819 100644 --- a/llvm/unittests/Frontend/CMakeLists.txt +++ b/llvm/unittests/Frontend/CMakeLists.txt @@ -12,6 +12,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_unittest(LLVMFrontendTests HLSLRootSignatureDumpTest.cpp + HLSLRootSignatureRangesTest.cpp OpenACCTest.cpp OpenMPContextTest.cpp OpenMPIRBuilderTest.cpp diff --git a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp index 90e6cd0a80d6b..1c37ee709e098 100644 --- a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp +++ b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp @@ -108,4 +108,193 @@ TEST(HLSLRootSignatureTest, DescriptorTableDump) { EXPECT_EQ(Out, Expected); } +TEST(HLSLRootSignatureTest, RootCBVDump) { + RootDescriptor Descriptor; + Descriptor.Type = DescriptorType::CBuffer; + Descriptor.Reg = {RegisterType::BReg, 0}; + Descriptor.setDefaultFlags(); + + std::string Out; + llvm::raw_string_ostream OS(Out); + OS << Descriptor; + OS.flush(); + + std::string Expected = "RootCBV(b0, space = 0, " + "visibility = All, " + "flags = DataStaticWhileSetAtExecute)"; + EXPECT_EQ(Out, Expected); +} + +TEST(HLSLRootSignatureTest, RootSRVDump) { + RootDescriptor Descriptor; + Descriptor.Type = DescriptorType::SRV; + Descriptor.Reg = 
{RegisterType::TReg, 0}; + Descriptor.Space = 42; + Descriptor.Visibility = ShaderVisibility::Geometry; + Descriptor.Flags = RootDescriptorFlags::None; + + std::string Out; + llvm::raw_string_ostream OS(Out); + OS << Descriptor; + OS.flush(); + + std::string Expected = + "RootSRV(t0, space = 42, visibility = Geometry, flags = None)"; + EXPECT_EQ(Out, Expected); +} + +TEST(HLSLRootSignatureTest, RootUAVDump) { + RootDescriptor Descriptor; + Descriptor.Type = DescriptorType::UAV; + Descriptor.Reg = {RegisterType::UReg, 92374}; + Descriptor.Space = 932847; + Descriptor.Visibility = ShaderVisibility::Hull; + Descriptor.Flags = RootDescriptorFlags::ValidFlags; + + std::string Out; + llvm::raw_string_ostream OS(Out); + OS << Descriptor; + OS.flush(); + + std::string Expected = + "RootUAV(u92374, space = 932847, visibility = Hull, flags = " + "DataVolatile | " + "DataStaticWhileSetAtExecute | " + "DataStatic)"; + EXPECT_EQ(Out, Expected); +} + +TEST(HLSLRootSignatureTest, DefaultStaticSamplerDump) { + StaticSampler Sampler; + Sampler.Reg = {RegisterType::SReg, 0}; + + std::string Out; + llvm::raw_string_ostream OS(Out); + OS << Sampler; + OS.flush(); + + std::string Expected = "StaticSampler(s0, " + "filter = Anisotropic, " + "addressU = Wrap, " + "addressV = Wrap, " + "addressW = Wrap, " + "mipLODBias = 0.000000e+00, " + "maxAnisotropy = 16, " + "comparisonFunc = LessEqual, " + "borderColor = OpaqueWhite, " + "minLOD = 0.000000e+00, " + "maxLOD = 3.402823e+38, " + "space = 0, " + "visibility = All" + ")"; + EXPECT_EQ(Out, Expected); +} + +TEST(HLSLRootSignatureTest, DefinedStaticSamplerDump) { + StaticSampler Sampler; + Sampler.Reg = {RegisterType::SReg, 0}; + + Sampler.Filter = SamplerFilter::ComparisonMinMagLinearMipPoint; + Sampler.AddressU = TextureAddressMode::Mirror; + Sampler.AddressV = TextureAddressMode::Border; + Sampler.AddressW = TextureAddressMode::Clamp; + Sampler.MipLODBias = 4.8f; + Sampler.MaxAnisotropy = 32; + Sampler.CompFunc = 
ComparisonFunc::NotEqual; + Sampler.BorderColor = StaticBorderColor::OpaqueBlack; + Sampler.MinLOD = 1.0f; + Sampler.MaxLOD = 32.0f; + Sampler.Space = 7; + Sampler.Visibility = ShaderVisibility::Domain; + + std::string Out; + llvm::raw_string_ostream OS(Out); + OS << Sampler; + OS.flush(); + + std::string Expected = "StaticSampler(s0, " + "filter = ComparisonMinMagLinearMipPoint, " + "addressU = Mirror, " + "addressV = Border, " + "addressW = Clamp, " + "mipLODBias = 4.800000e+00, " + "maxAnisotropy = 32, " + "comparisonFunc = NotEqual, " + "borderColor = OpaqueBlack, " + "minLOD = 1.000000e+00, " + "maxLOD = 3.200000e+01, " + "space = 7, " + "visibility = Domain" + ")"; + EXPECT_EQ(Out, Expected); +} + +TEST(HLSLRootSignatureTest, DefaultRootConstantsDump) { + RootConstants Constants; + Constants.Num32BitConstants = 1; + Constants.Reg = {RegisterType::BReg, 3}; + + std::string Out; + llvm::raw_string_ostream OS(Out); + OS << Constants; + OS.flush(); + + std::string Expected = "RootConstants(num32BitConstants = 1, b3, space = 0, " + "visibility = All)"; + EXPECT_EQ(Out, Expected); +} + +TEST(HLSLRootSignatureTest, SetRootConstantsDump) { + RootConstants Constants; + Constants.Num32BitConstants = 983; + Constants.Reg = {RegisterType::BReg, 34593}; + Constants.Space = 7; + Constants.Visibility = ShaderVisibility::Pixel; + + std::string Out; + llvm::raw_string_ostream OS(Out); + OS << Constants; + OS.flush(); + + std::string Expected = "RootConstants(num32BitConstants = 983, b34593, " + "space = 7, visibility = Pixel)"; + EXPECT_EQ(Out, Expected); +} + +TEST(HLSLRootSignatureTest, NoneRootFlagsDump) { + RootFlags Flags = RootFlags::None; + + std::string Out; + llvm::raw_string_ostream OS(Out); + OS << Flags; + OS.flush(); + + std::string Expected = "RootFlags(None)"; + EXPECT_EQ(Out, Expected); +} + +TEST(HLSLRootSignatureTest, AllRootFlagsDump) { + RootFlags Flags = RootFlags::ValidFlags; + + std::string Out; + llvm::raw_string_ostream OS(Out); + OS << Flags; + 
OS.flush(); + + std::string Expected = "RootFlags(" + "AllowInputAssemblerInputLayout | " + "DenyVertexShaderRootAccess | " + "DenyHullShaderRootAccess | " + "DenyDomainShaderRootAccess | " + "DenyGeometryShaderRootAccess | " + "DenyPixelShaderRootAccess | " + "AllowStreamOutput | " + "LocalRootSignature | " + "DenyAmplificationShaderRootAccess | " + "DenyMeshShaderRootAccess | " + "CBVSRVUAVHeapDirectlyIndexed | " + "SamplerHeapDirectlyIndexed)"; + EXPECT_EQ(Out, Expected); +} + } // namespace diff --git a/llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp new file mode 100644 index 0000000000000..0ef6fe84f0ec9 --- /dev/null +++ b/llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp @@ -0,0 +1,177 @@ +//===------ HLSLRootSignatureRangeTest.cpp - RootSignature Range tests ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Frontend/HLSL/HLSLRootSignatureUtils.h" +#include "gtest/gtest.h" + +using namespace llvm::hlsl::rootsig; + +namespace { + +TEST(HLSLRootSignatureTest, NoOverlappingInsertTests) { + // Ensures that there is never a reported overlap + ResourceRange::MapT::Allocator Allocator; + ResourceRange Range(Allocator); + + RangeInfo A; + A.LowerBound = 0; + A.UpperBound = 3; + EXPECT_EQ(Range.insert(A), std::nullopt); + + RangeInfo B; + B.LowerBound = 4; + B.UpperBound = 7; + EXPECT_EQ(Range.insert(B), std::nullopt); + + RangeInfo C; + C.LowerBound = 10; + C.UpperBound = RangeInfo::Unbounded; + EXPECT_EQ(Range.insert(C), std::nullopt); + + // A = [0;3] + EXPECT_EQ(Range.lookup(0), &A); + EXPECT_EQ(Range.lookup(2), &A); + EXPECT_EQ(Range.lookup(3), &A); + + // B = [4;7] + EXPECT_EQ(Range.lookup(4), &B); + EXPECT_EQ(Range.lookup(5), &B); + EXPECT_EQ(Range.lookup(7), &B); + + EXPECT_EQ(Range.lookup(8), nullptr); + EXPECT_EQ(Range.lookup(9), nullptr); + + // C = [10;unbounded] + EXPECT_EQ(Range.lookup(10), &C); + EXPECT_EQ(Range.lookup(42), &C); + EXPECT_EQ(Range.lookup(98237423), &C); + EXPECT_EQ(Range.lookup(RangeInfo::Unbounded), &C); +} + +TEST(HLSLRootSignatureTest, SingleOverlappingInsertTests) { + // Ensures that we correctly report an overlap when we insert a range that + // overlaps with one other range but does not cover (replace) it + ResourceRange::MapT::Allocator Allocator; + ResourceRange Range(Allocator); + + RangeInfo A; + A.LowerBound = 1; + A.UpperBound = 5; + EXPECT_EQ(Range.insert(A), std::nullopt); + + RangeInfo B; + B.LowerBound = 0; + B.UpperBound = 2; + EXPECT_EQ(Range.insert(B).value(), &A); + + RangeInfo C; + C.LowerBound = 4; + C.UpperBound = RangeInfo::Unbounded; + EXPECT_EQ(Range.insert(C).value(), &A); + + // A = [1;5] + EXPECT_EQ(Range.lookup(1), &A); + EXPECT_EQ(Range.lookup(2), &A); + 
EXPECT_EQ(Range.lookup(3), &A); + EXPECT_EQ(Range.lookup(4), &A); + EXPECT_EQ(Range.lookup(5), &A); + + // B = [0;0] + EXPECT_EQ(Range.lookup(0), &B); + + // C = [6; unbounded] + EXPECT_EQ(Range.lookup(6), &C); + EXPECT_EQ(Range.lookup(RangeInfo::Unbounded), &C); +} + +TEST(HLSLRootSignatureTest, MultipleOverlappingInsertTests) { + // Ensures that we correctly report an overlap when inserted range + // overlaps more than one range and it does not cover (replace) either + // range. In this case it will just fill in the interval between the two + ResourceRange::MapT::Allocator Allocator; + ResourceRange Range(Allocator); + + RangeInfo A; + A.LowerBound = 0; + A.UpperBound = 2; + EXPECT_EQ(Range.insert(A), std::nullopt); + + RangeInfo B; + B.LowerBound = 4; + B.UpperBound = 6; + EXPECT_EQ(Range.insert(B), std::nullopt); + + RangeInfo C; + C.LowerBound = 1; + C.UpperBound = 5; + EXPECT_EQ(Range.insert(C).value(), &A); + + // A = [0;2] + EXPECT_EQ(Range.lookup(0), &A); + EXPECT_EQ(Range.lookup(1), &A); + EXPECT_EQ(Range.lookup(2), &A); + + // B = [4;6] + EXPECT_EQ(Range.lookup(4), &B); + EXPECT_EQ(Range.lookup(5), &B); + EXPECT_EQ(Range.lookup(6), &B); + + // C = [3;3] + EXPECT_EQ(Range.lookup(3), &C); +} + +TEST(HLSLRootSignatureTest, CoverInsertTests) { + // Ensures that we correctly report an overlap when inserted range + // covers one or more ranges + ResourceRange::MapT::Allocator Allocator; + ResourceRange Range(Allocator); + + RangeInfo A; + A.LowerBound = 0; + A.UpperBound = 2; + EXPECT_EQ(Range.insert(A), std::nullopt); + + RangeInfo B; + B.LowerBound = 4; + B.UpperBound = 5; + EXPECT_EQ(Range.insert(B), std::nullopt); + + // Covers B + RangeInfo C; + C.LowerBound = 4; + C.UpperBound = 6; + EXPECT_EQ(Range.insert(C).value(), &B); + + // A = [0;2] + // C = [4;6] <- covers reference to B + EXPECT_EQ(Range.lookup(0), &A); + EXPECT_EQ(Range.lookup(1), &A); + EXPECT_EQ(Range.lookup(2), &A); + EXPECT_EQ(Range.lookup(3), nullptr); + EXPECT_EQ(Range.lookup(4), &C); + 
EXPECT_EQ(Range.lookup(5), &C); + EXPECT_EQ(Range.lookup(6), &C); + + // Covers all other ranges + RangeInfo D; + D.LowerBound = 0; + D.UpperBound = 7; + EXPECT_EQ(Range.insert(D).value(), &A); + + // D = [0;7] <- Covers reference to A and C + EXPECT_EQ(Range.lookup(0), &D); + EXPECT_EQ(Range.lookup(1), &D); + EXPECT_EQ(Range.lookup(2), &D); + EXPECT_EQ(Range.lookup(3), &D); + EXPECT_EQ(Range.lookup(4), &D); + EXPECT_EQ(Range.lookup(5), &D); + EXPECT_EQ(Range.lookup(6), &D); + EXPECT_EQ(Range.lookup(7), &D); +} + +} // namespace diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp index 493e673d6a07d..1073df95c379a 100644 --- a/llvm/unittests/Object/ELFObjectFileTest.cpp +++ b/llvm/unittests/Object/ELFObjectFileTest.cpp @@ -896,17 +896,6 @@ TEST(ELFObjectFileTest, InvalidDecodePGOAnalysisMap) { "are enabled: version = 1 feature = 4"); } - SmallString<128> CommonVersionedYamlString(CommonYamlString); - CommonVersionedYamlString += R"( - - Version: 2 - BBRanges: - - BBEntries: - - ID: 1 - AddressOffset: 0x0 - Size: 0x1 - Metadata: 0x2 -)"; - // Check that we fail when function entry count is enabled but not provided. 
SmallString<128> MissingFuncEntryCount(CommonYamlString); MissingFuncEntryCount += R"( diff --git a/llvm/unittests/Target/AArch64/CMakeLists.txt b/llvm/unittests/Target/AArch64/CMakeLists.txt index 67eb508e9bab8..9387ca90dd31a 100644 --- a/llvm/unittests/Target/AArch64/CMakeLists.txt +++ b/llvm/unittests/Target/AArch64/CMakeLists.txt @@ -16,6 +16,7 @@ set(LLVM_LINK_COMPONENTS GlobalISel MC MIRParser + Passes SelectionDAG Support Target diff --git a/llvm/unittests/Target/LoongArch/CMakeLists.txt b/llvm/unittests/Target/LoongArch/CMakeLists.txt index 6e7e49b4cb4e0..c3d33418a03aa 100644 --- a/llvm/unittests/Target/LoongArch/CMakeLists.txt +++ b/llvm/unittests/Target/LoongArch/CMakeLists.txt @@ -10,6 +10,7 @@ set(LLVM_LINK_COMPONENTS LoongArchCodeGen LoongArchDesc LoongArchInfo + Instrumentation MC MIRParser SelectionDAG diff --git a/llvm/unittests/Target/RISCV/CMakeLists.txt b/llvm/unittests/Target/RISCV/CMakeLists.txt index 10d6412f9b358..8da8c3896faf1 100644 --- a/llvm/unittests/Target/RISCV/CMakeLists.txt +++ b/llvm/unittests/Target/RISCV/CMakeLists.txt @@ -11,6 +11,7 @@ set(LLVM_LINK_COMPONENTS CodeGen Core MC + Passes SelectionDAG TargetParser ) diff --git a/llvm/unittests/Target/SPIRV/CMakeLists.txt b/llvm/unittests/Target/SPIRV/CMakeLists.txt index d7f0290089c4c..29b31b16094a0 100644 --- a/llvm/unittests/Target/SPIRV/CMakeLists.txt +++ b/llvm/unittests/Target/SPIRV/CMakeLists.txt @@ -8,6 +8,7 @@ set(LLVM_LINK_COMPONENTS AsmParser BinaryFormat Core + Passes SPIRVCodeGen SPIRVAnalysis Support diff --git a/llvm/unittests/Target/VE/CMakeLists.txt b/llvm/unittests/Target/VE/CMakeLists.txt index 271bf07f5b5d7..de823306a9aed 100644 --- a/llvm/unittests/Target/VE/CMakeLists.txt +++ b/llvm/unittests/Target/VE/CMakeLists.txt @@ -10,6 +10,7 @@ set(LLVM_LINK_COMPONENTS CodeGen Core GlobalISel + Instrumentation MC SelectionDAG Support diff --git a/llvm/unittests/Target/WebAssembly/CMakeLists.txt b/llvm/unittests/Target/WebAssembly/CMakeLists.txt index 
b1e01169e7a06..b1e180d218c1f 100644 --- a/llvm/unittests/Target/WebAssembly/CMakeLists.txt +++ b/llvm/unittests/Target/WebAssembly/CMakeLists.txt @@ -7,6 +7,7 @@ set(LLVM_LINK_COMPONENTS CodeGen CodeGenTypes Core + Instrumentation MC MIRParser TargetParser diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 29bfa30848ec9..66e335a33a3f7 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -684,9 +684,9 @@ TEST(ParseArchString, RejectsConflictingExtensions) { for (StringRef Input : {"rv64i_xqcia0p7", "rv64i_xqciac0p3", "rv64i_xqcibi0p2", "rv64i_xqcibm0p8", "rv64i_xqcicli0p3", "rv64i_xqcicm0p2", - "rv64i_xqcics0p2", "rv64i_xqcicsr0p3", "rv64i_xqciint0p7", + "rv64i_xqcics0p2", "rv64i_xqcicsr0p4", "rv64i_xqciint0p10", "rv64i_xqciio0p1", "rv64i_xqcilb0p2", "rv64i_xqcili0p2", - "rv64i_xqcilia0p2", "rv64i_xqcilo0p3", "rv64i_xqcilsm0p5", + "rv64i_xqcilia0p2", "rv64i_xqcilo0p3", "rv64i_xqcilsm0p6", "rv64i_xqcisim0p2", "rv64i_xqcisls0p2", "rv64i_xqcisync0p3"}) { EXPECT_THAT( toString(RISCVISAInfo::parseArchString(Input, true).takeError()), @@ -695,13 +695,13 @@ TEST(ParseArchString, RejectsConflictingExtensions) { for (StringRef Input : {"rv32idc_xqciac0p3", "rv32i_zcd_xqciac0p3", "rv32idc_xqcicm0p2", - "rv32i_zcd_xqcicm0p2", "rv32idc_xqccmp0p1", "rv32i_zcd_xqccmp0p1"}) { + "rv32i_zcd_xqcicm0p2", "rv32idc_xqccmp0p3", "rv32i_zcd_xqccmp0p3"}) { EXPECT_THAT( toString(RISCVISAInfo::parseArchString(Input, true).takeError()), ::testing::EndsWith("extension when 'd' extension is enabled")); } - for (StringRef Input : {"rv32i_zcmp_xqccmp0p1", "rv64i_zcmp_xqccmp0p1"}) { + for (StringRef Input : {"rv32i_zcmp_xqccmp0p3", "rv64i_zcmp_xqccmp0p3"}) { EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()), "'zcmp' and 'xqccmp' extensions are incompatible"); } @@ -1130,6 +1130,7 @@ R"(All available -march extensions for RISC-V svpbmt 1.0 
svvptc 1.0 xandesperf 5.0 + xandesvbfhcvt 5.0 xandesvdot 5.0 xandesvpackfph 5.0 xcvalu 1.0 @@ -1184,7 +1185,7 @@ Experimental extensions smctr 1.0 ssctr 1.0 svukte 0.3 - xqccmp 0.1 + xqccmp 0.3 xqcia 0.7 xqciac 0.3 xqcibi 0.2 @@ -1192,14 +1193,14 @@ Experimental extensions xqcicli 0.3 xqcicm 0.2 xqcics 0.2 - xqcicsr 0.3 - xqciint 0.7 + xqcicsr 0.4 + xqciint 0.10 xqciio 0.1 xqcilb 0.2 xqcili 0.2 xqcilia 0.2 xqcilo 0.3 - xqcilsm 0.5 + xqcilsm 0.6 xqcisim 0.2 xqcisls 0.2 xqcisync 0.3 diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index f4c93334ac682..c4efb991ab6fd 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1831,6 +1831,22 @@ TEST_P(AArch64ExtensionDependenciesBaseCPUTestFixture, } } +TEST(TargetParserTest, testAArch64ReconstructFromParsedFeatures) { + AArch64::ExtensionSet Extensions; + std::vector FeatureOptions = { + "-sve2", "-Baz", "+sve", "+FooBar", "+sve2", "+neon", "-sve", + }; + std::vector NonExtensions; + Extensions.reconstructFromParsedFeatures(FeatureOptions, NonExtensions); + + std::vector NonExtensionsExpected = {"-Baz", "+FooBar"}; + ASSERT_THAT(NonExtensions, testing::ContainerEq(NonExtensionsExpected)); + std::vector Features; + Extensions.toLLVMFeatureList(Features); + std::vector FeaturesExpected = {"+neon", "-sve", "+sve2"}; + ASSERT_THAT(Features, testing::ContainerEq(FeaturesExpected)); +} + AArch64ExtensionDependenciesBaseArchTestParams AArch64ExtensionDependenciesArchData[] = { // Base architecture features diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index 32098e96ce721..b6d9c9f3a1584 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -388,7 +388,7 @@ struct MatchableInfo { StringRef Token; /// The unique class instance this operand should match. 
- ClassInfo *Class; + ClassInfo *Class = nullptr; /// The operand name this is, if anything. StringRef SrcOpName; @@ -397,18 +397,17 @@ struct MatchableInfo { StringRef OrigSrcOpName; /// The suboperand index within SrcOpName, or -1 for the entire operand. - int SubOpIdx; + int SubOpIdx = -1; /// Whether the token is "isolated", i.e., it is preceded and followed /// by separators. bool IsIsolatedToken; /// Register record if this token is singleton register. - const Record *SingletonReg; + const Record *SingletonReg = nullptr; explicit AsmOperand(bool IsIsolatedToken, StringRef T) - : Token(T), Class(nullptr), SubOpIdx(-1), - IsIsolatedToken(IsIsolatedToken), SingletonReg(nullptr) {} + : Token(T), IsIsolatedToken(IsIsolatedToken) {} }; /// ResOperand - This represents a single operand in the result instruction diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index 810b35e65b310..3a4ca1b451567 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -3604,16 +3604,14 @@ class InstAnalyzer { const CodeGenDAGPatterns &CDP; public: - bool hasSideEffects; - bool mayStore; - bool mayLoad; - bool isBitcast; - bool isVariadic; - bool hasChain; - - InstAnalyzer(const CodeGenDAGPatterns &cdp) - : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false), - isBitcast(false), isVariadic(false), hasChain(false) {} + bool hasSideEffects = false; + bool mayStore = false; + bool mayLoad = false; + bool isBitcast = false; + bool isVariadic = false; + bool hasChain = false; + + InstAnalyzer(const CodeGenDAGPatterns &cdp) : CDP(cdp) {} void Analyze(const PatternToMatch &Pat) { const TreePatternNode &N = Pat.getSrcPattern(); diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp index 0dfcf200d7e4b..2ec3683e116e9 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp +++ 
b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp @@ -435,7 +435,7 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) { CodeGenInstruction::CodeGenInstruction(const Record *R) : TheDef(R), Operands(R), InferredFrom(nullptr) { Namespace = R->getValueAsString("Namespace"); - AsmString = R->getValueAsString("AsmString").str(); + AsmString = R->getValueAsString("AsmString"); isPreISelOpcode = R->getValueAsBit("isPreISelOpcode"); isReturn = R->getValueAsBit("isReturn"); diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h index 3a5abc55319b1..0db12b551b437 100644 --- a/llvm/utils/TableGen/Common/CodeGenInstruction.h +++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h @@ -226,7 +226,7 @@ class CodeGenInstruction { /// AsmString - The format string used to emit a .s file for the /// instruction. - std::string AsmString; + StringRef AsmString; /// Operands - This is information about the (ins) and (outs) list specified /// to the instruction. 
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index f52c21e97f9c8..57a243158692b 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -164,8 +164,8 @@ CodeGenRegister::CodeGenRegister(const Record *R, unsigned Enum) : TheDef(R), EnumValue(Enum), CostPerUse(R->getValueAsListOfInts("CostPerUse")), CoveredBySubRegs(R->getValueAsBit("CoveredBySubRegs")), - HasDisjunctSubRegs(false), Constant(R->getValueAsBit("isConstant")), - SubRegsComplete(false), SuperRegsComplete(false), TopoSig(~0u) { + Constant(R->getValueAsBit("isConstant")), SubRegsComplete(false), + SuperRegsComplete(false), TopoSig(~0u) { Artificial = R->getValueAsBit("isArtificial"); } diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h index 3f4c157fab69a..bbcd44ce2cc5b 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.h +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h @@ -564,7 +564,7 @@ struct RegUnit { // Weight assigned to this RegUnit for estimating register pressure. // This is useful when equalizing weights in register classes with mixed // register topologies. - unsigned Weight; + unsigned Weight = 0; // Each native RegUnit corresponds to one or two root registers. The full // set of registers containing this unit can be computed as the union of @@ -573,14 +573,12 @@ struct RegUnit { // Index into RegClassUnitSets where we can find the list of UnitSets that // contain this unit. - unsigned RegClassUnitSetsIdx; + unsigned RegClassUnitSetsIdx = 0; // A register unit is artificial if at least one of its roots is // artificial. 
- bool Artificial; + bool Artificial = false; - RegUnit() : Weight(0), RegClassUnitSetsIdx(0), Artificial(false) { - Roots[0] = Roots[1] = nullptr; - } + RegUnit() { Roots[0] = Roots[1] = nullptr; } ArrayRef getRoots() const { assert(!(Roots[1] && !Roots[0]) && "Invalid roots array"); diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.h b/llvm/utils/TableGen/Common/CodeGenSchedule.h index 697a1ce8f75ac..1d5e953cf70c7 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.h +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.h @@ -193,7 +193,7 @@ struct CodeGenRegisterFile { unsigned MaxMovesEliminatedPerCycle; bool AllowZeroMoveEliminationOnly; - unsigned NumPhysRegs; + unsigned NumPhysRegs = 0; std::vector Costs; CodeGenRegisterFile(StringRef name, const Record *def, @@ -201,7 +201,7 @@ struct CodeGenRegisterFile { bool AllowZeroMoveElimOnly = false) : Name(name), RegisterFileDef(def), MaxMovesEliminatedPerCycle(MaxMoveElimPerCy), - AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly), NumPhysRegs(0) {} + AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly) {} bool hasDefaultCosts() const { return Costs.empty(); } }; @@ -261,16 +261,16 @@ struct CodeGenProcModel { std::vector RegisterFiles; // Optional Retire Control Unit definition. - const Record *RetireControlUnit; + const Record *RetireControlUnit = nullptr; // Load/Store queue descriptors. 
- const Record *LoadQueue; - const Record *StoreQueue; + const Record *LoadQueue = nullptr; + const Record *StoreQueue = nullptr; CodeGenProcModel(unsigned Idx, std::string Name, const Record *MDef, const Record *IDef) - : Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef), - RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {} + : Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef) { + } bool hasItineraries() const { return !ItinsDef->getValueAsListOfDefs("IID").empty(); diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h index 66472576eea8f..620f88db66109 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h @@ -501,13 +501,13 @@ class RuleMatcher : public Matcher { /// ID for the next instruction variable defined with /// implicitlyDefineInsnVar() - unsigned NextInsnVarID; + unsigned NextInsnVarID = 0; /// ID for the next output instruction allocated with allocateOutputInsnID() - unsigned NextOutputInsnID; + unsigned NextOutputInsnID = 0; /// ID for the next temporary register ID allocated with allocateTempRegID() - unsigned NextTempRegID; + unsigned NextTempRegID = 0; /// ID for the next recorded type. Starts at -1 and counts down. TempTypeIdx NextTempTypeIdx = -1; @@ -545,9 +545,7 @@ class RuleMatcher : public Matcher { StringRef FlagName, GISelFlags FlagBit); public: - RuleMatcher(ArrayRef SrcLoc) - : NextInsnVarID(0), NextOutputInsnID(0), NextTempRegID(0), SrcLoc(SrcLoc), - RuleID(NextRuleID++) {} + RuleMatcher(ArrayRef SrcLoc) : SrcLoc(SrcLoc), RuleID(NextRuleID++) {} RuleMatcher(RuleMatcher &&Other) = default; RuleMatcher &operator=(RuleMatcher &&Other) = default; @@ -2039,12 +2037,12 @@ class CopyConstantAsImmRenderer : public OperandRenderer { unsigned NewInsnID; /// The name of the operand. 
const std::string SymbolicName; - bool Signed; + bool Signed = true; public: CopyConstantAsImmRenderer(unsigned NewInsnID, StringRef SymbolicName) : OperandRenderer(OR_CopyConstantAsImm), NewInsnID(NewInsnID), - SymbolicName(SymbolicName), Signed(true) {} + SymbolicName(SymbolicName) {} static bool classof(const OperandRenderer *R) { return R->getKind() == OR_CopyConstantAsImm; @@ -2359,7 +2357,7 @@ class BuildMIAction : public MatchAction { private: unsigned InsnID; const CodeGenInstruction *I; - InstructionMatcher *Matched; + InstructionMatcher *Matched = nullptr; std::vector> OperandRenderers; SmallPtrSet DeadImplicitDefs; @@ -2372,7 +2370,7 @@ class BuildMIAction : public MatchAction { public: BuildMIAction(unsigned InsnID, const CodeGenInstruction *I) - : MatchAction(AK_BuildMI), InsnID(InsnID), I(I), Matched(nullptr) {} + : MatchAction(AK_BuildMI), InsnID(InsnID), I(I) {} static bool classof(const MatchAction *A) { return A->getKind() == AK_BuildMI; diff --git a/llvm/utils/TableGen/Common/PredicateExpander.h b/llvm/utils/TableGen/Common/PredicateExpander.h index 0c3a8718a473f..4439327af2b03 100644 --- a/llvm/utils/TableGen/Common/PredicateExpander.h +++ b/llvm/utils/TableGen/Common/PredicateExpander.h @@ -25,9 +25,9 @@ namespace llvm { class Record; class PredicateExpander { - bool EmitCallsByRef; - bool NegatePredicate; - bool ExpandForMC; + bool EmitCallsByRef = true; + bool NegatePredicate = false; + bool ExpandForMC = false; StringRef TargetName; PredicateExpander(const PredicateExpander &) = delete; @@ -38,8 +38,7 @@ class PredicateExpander { public: explicit PredicateExpander(StringRef Target, unsigned Indent = 1) - : EmitCallsByRef(true), NegatePredicate(false), ExpandForMC(false), - TargetName(Target), Indent(Indent, 2) {} + : TargetName(Target), Indent(Indent, 2) {} bool isByRef() const { return EmitCallsByRef; } bool shouldNegate() const { return NegatePredicate; } bool shouldExpandForMC() const { return ExpandForMC; } diff --git 
a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index 0039ff4f3e2d7..227311b0a3bc8 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp @@ -76,7 +76,7 @@ class MatcherGen { /// NextRecordedOperandNo - As we emit opcodes to record matched values in /// the RecordedNodes array, this keeps track of which slot will be next to /// record into. - unsigned NextRecordedOperandNo; + unsigned NextRecordedOperandNo = 0; /// MatchedChainNodes - This maintains the position in the recorded nodes /// array of all of the recorded input nodes that have chains. @@ -94,11 +94,11 @@ class MatcherGen { SmallVector, 2> PhysRegInputs; /// Matcher - This is the top level of the generated matcher, the result. - Matcher *TheMatcher; + Matcher *TheMatcher = nullptr; /// CurPredicate - As we emit matcher nodes, this points to the latest check /// which should have future checks stuck into its Next position. - Matcher *CurPredicate; + Matcher *CurPredicate = nullptr; public: MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp); @@ -147,8 +147,7 @@ class MatcherGen { MatcherGen::MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp) - : Pattern(pattern), CGP(cgp), NextRecordedOperandNo(0), TheMatcher(nullptr), - CurPredicate(nullptr) { + : Pattern(pattern), CGP(cgp) { // We need to produce the matcher tree for the patterns source pattern. To // do this we need to match the structure as well as the types. 
To do the // type matching, we want to figure out the fewest number of type checks we diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 56c3644c134f1..37814113b467a 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -104,10 +104,9 @@ struct OperandInfo { std::vector Fields; std::string Decoder; bool HasCompleteDecoder; - uint64_t InitValue; + uint64_t InitValue = 0; - OperandInfo(std::string D, bool HCD) - : Decoder(D), HasCompleteDecoder(HCD), InitValue(0) {} + OperandInfo(std::string D, bool HCD) : Decoder(D), HasCompleteDecoder(HCD) {} void addField(unsigned Base, unsigned Width, unsigned Offset) { Fields.push_back(EncodingField(Base, Width, Offset)); @@ -223,10 +222,11 @@ class DecoderEmitter { DecoderEmitter(const RecordKeeper &R, StringRef PredicateNamespace) : RK(R), Target(R), PredicateNamespace(PredicateNamespace) {} - // Emit the decoder state machine table. - void emitTable(formatted_raw_ostream &OS, DecoderTable &Table, indent Indent, - unsigned BitWidth, StringRef Namespace, - const EncodingIDsVec &EncodingIDs) const; + // Emit the decoder state machine table. Returns a mask of MCD decoder ops + // that were emitted. + unsigned emitTable(formatted_raw_ostream &OS, DecoderTable &Table, + indent Indent, unsigned BitWidth, StringRef Namespace, + const EncodingIDsVec &EncodingIDs) const; void emitInstrLenTable(formatted_raw_ostream &OS, ArrayRef InstrLen) const; void emitPredicateFunction(formatted_raw_ostream &OS, @@ -827,11 +827,12 @@ unsigned Filter::usefulness() const { // // ////////////////////////////////// -// Emit the decoder state machine table. -void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, - indent Indent, unsigned BitWidth, - StringRef Namespace, - const EncodingIDsVec &EncodingIDs) const { +// Emit the decoder state machine table. Returns a mask of MCD decoder ops +// that were emitted. 
+unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, + DecoderTable &Table, indent Indent, + unsigned BitWidth, StringRef Namespace, + const EncodingIDsVec &EncodingIDs) const { // We'll need to be able to map from a decoded opcode into the corresponding // EncodingID for this specific combination of BitWidth and Namespace. This // is used below to index into NumberedEncodings. @@ -885,6 +886,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, OS << " (Fail)"; }; + unsigned OpcodeMask = 0; + while (I != E) { assert(I < E && "incomplete decode table entry!"); @@ -893,6 +896,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, OS.PadToColumn(12); const uint8_t DecoderOp = *I++; + OpcodeMask |= (1 << DecoderOp); switch (DecoderOp) { default: PrintFatalError("Invalid decode table opcode: " + Twine((int)DecoderOp) + @@ -1028,6 +1032,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, Indent -= 2; OS << Indent << "};\n\n"; + + return OpcodeMask; } void DecoderEmitter::emitInstrLenTable(formatted_raw_ostream &OS, @@ -1046,19 +1052,13 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS, OS << Indent << "static bool checkDecoderPredicate(unsigned Idx, " << "const FeatureBitset &Bits) {\n"; Indent += 2; - if (!Predicates.empty()) { - OS << Indent << "switch (Idx) {\n"; - OS << Indent << "default: llvm_unreachable(\"Invalid index!\");\n"; - unsigned Index = 0; - for (const auto &Predicate : Predicates) { - OS << Indent << "case " << Index++ << ":\n"; - OS << Indent + 2 << "return (" << Predicate << ");\n"; - } - OS << Indent << "}\n"; - } else { - // No case statement to emit - OS << Indent << "llvm_unreachable(\"Invalid index!\");\n"; + OS << Indent << "switch (Idx) {\n"; + OS << Indent << "default: llvm_unreachable(\"Invalid index!\");\n"; + for (const auto &[Index, Predicate] : enumerate(Predicates)) { + OS << Indent << "case " << Index << ":\n"; + 
OS << Indent + 2 << "return (" << Predicate << ");\n"; } + OS << Indent << "}\n"; Indent -= 2; OS << Indent << "}\n\n"; } @@ -2218,8 +2218,15 @@ static void insertBits(InsnType &field, InsnType bits, unsigned startBit, // emitDecodeInstruction - Emit the templated helper function // decodeInstruction(). -static void emitDecodeInstruction(formatted_raw_ostream &OS, - bool IsVarLenInst) { +static void emitDecodeInstruction(formatted_raw_ostream &OS, bool IsVarLenInst, + unsigned OpcodeMask) { + const bool HasTryDecode = OpcodeMask & ((1 << MCD::OPC_TryDecode) | + (1 << MCD::OPC_TryDecodeOrFail)); + const bool HasCheckPredicate = + OpcodeMask & + ((1 << MCD::OPC_CheckPredicate) | (1 << MCD::OPC_CheckPredicateOrFail)); + const bool HasSoftFail = OpcodeMask & (1 << MCD::OPC_SoftFail); + OS << R"( static unsigned decodeNumToSkip(const uint8_t *&Ptr) { unsigned NumToSkip = *Ptr++; @@ -2239,9 +2246,11 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, OS << ",\n " "llvm::function_ref makeUp"; } - OS << R"() { - const FeatureBitset &Bits = STI.getFeatureBits(); + OS << ") {\n"; + if (HasCheckPredicate) + OS << " const FeatureBitset &Bits = STI.getFeatureBits();\n"; + OS << R"( const uint8_t *Ptr = DecodeTable; uint64_t CurFieldValue = 0; DecodeStatus S = MCDisassembler::Success; @@ -2322,7 +2331,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, Ptr += NumToSkip; } break; - } + })"; + if (HasCheckPredicate) { + OS << R"( case MCD::OPC_CheckPredicate: case MCD::OPC_CheckPredicateOrFail: { bool IsFail = DecoderOp == MCD::OPC_CheckPredicateOrFail; @@ -2344,7 +2355,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, Ptr += NumToSkip; } break; - } + })"; + } + OS << R"( case MCD::OPC_Decode: { // Decode the Opcode value. 
unsigned Opc = decodeULEB128AndIncUnsafe(Ptr); @@ -2365,7 +2378,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, << ", using decoder " << DecodeIdx << ": " << (S != MCDisassembler::Fail ? "PASS\n" : "FAIL\n")); return S; - } + })"; + if (HasTryDecode) { + OS << R"( case MCD::OPC_TryDecode: case MCD::OPC_TryDecodeOrFail: { bool IsFail = DecoderOp == MCD::OPC_TryDecodeOrFail; @@ -2400,17 +2415,22 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI, // set before the decode attempt. S = MCDisassembler::Success; break; - } - case MCD::OPC_SoftFail: { - // Decode the mask values. - uint64_t PositiveMask = decodeULEB128AndIncUnsafe(Ptr); - uint64_t NegativeMask = decodeULEB128AndIncUnsafe(Ptr); - bool Failed = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0; - if (Failed) - S = MCDisassembler::SoftFail; - LLVM_DEBUG(dbgs() << Loc << ": OPC_SoftFail: " << (Failed ? "FAIL\n" : "PASS\n")); - break; - } + })"; + } + if (HasSoftFail) { + OS << R"( + case MCD::OPC_SoftFail: { + // Decode the mask values. + uint64_t PositiveMask = decodeULEB128AndIncUnsafe(Ptr); + uint64_t NegativeMask = decodeULEB128AndIncUnsafe(Ptr); + bool Failed = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0; + if (Failed) + S = MCDisassembler::SoftFail; + LLVM_DEBUG(dbgs() << Loc << ": OPC_SoftFail: " << (Failed ? "FAIL\n" : "PASS\n")); + break; + })"; + } + OS << R"( case MCD::OPC_Fail: { LLVM_DEBUG(dbgs() << Loc << ": OPC_Fail\n"); return MCDisassembler::Fail; @@ -2610,6 +2630,7 @@ namespace { } DecoderTableInfo TableInfo; + unsigned OpcodeMask = 0; for (const auto &Opc : OpcMap) { // Emit the decoder for this namespace+width combination. ArrayRef NumberedEncodingsRef(NumberedEncodings.data(), @@ -2635,8 +2656,8 @@ namespace { TableInfo.Table.push_back(MCD::OPC_Fail); // Print the table to the output stream. 
- emitTable(OS, TableInfo.Table, indent(0), FC.getBitWidth(), Opc.first.first, - Opc.second); + OpcodeMask |= emitTable(OS, TableInfo.Table, indent(0), FC.getBitWidth(), + Opc.first.first, Opc.second); } // For variable instruction, we emit a instruction length table @@ -2644,14 +2665,20 @@ namespace { // You can see example usage in M68k's disassembler. if (IsVarLenInst) emitInstrLenTable(OS, InstrLen); + + const bool HasCheckPredicate = + OpcodeMask & + ((1 << MCD::OPC_CheckPredicate) | (1 << MCD::OPC_CheckPredicateOrFail)); + // Emit the predicate function. - emitPredicateFunction(OS, TableInfo.Predicates, indent(0)); + if (HasCheckPredicate) + emitPredicateFunction(OS, TableInfo.Predicates, indent(0)); // Emit the decoder function. emitDecoderFunction(OS, TableInfo.Decoders, indent(0)); // Emit the main entry point for the decoder, decodeInstruction(). - emitDecodeInstruction(OS, IsVarLenInst); + emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask); OS << "\n} // namespace\n"; } diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index a8b6f79c176a7..694d89a5ada3c 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -86,10 +86,10 @@ namespace { struct OperandsSignature { class OpKind { enum { OK_Reg, OK_FP, OK_Imm, OK_Invalid = -1 }; - char Repr; + char Repr = OK_Invalid; public: - OpKind() : Repr(OK_Invalid) {} + OpKind() {} bool operator<(OpKind RHS) const { return Repr < RHS.Repr; } bool operator==(OpKind RHS) const { return Repr == RHS.Repr; } diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn index 15b57f7d85fc7..6dcce2db37964 100644 --- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn @@ -92,6 +92,7 @@ write_lit_cfg("lit_api_site_cfg") { "LLDB_FRAMEWORK_DIR=XXX_framework_dir", "CMAKE_CXX_COMPILER=c++", # XXX use bin/clang++ instead? 
"HOST_OS=$host_os", # XXX + "Python3_ROOT_DIR=", # FIXME ] if (is_debug) { diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index ca05ac1b24647..f4ee2599c01ce 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -223,6 +223,7 @@ write_cmake_config("config") { "HAVE_SIGALTSTACK=", "HAVE_STRERROR_R=", "HAVE_SYSCONF=", + "HAVE_SYS_IOCTL_H=", "HAVE_SYS_MMAN_H=", "HAVE_UNISTD_H=", "HAVE__CHSIZE_S=1", @@ -250,6 +251,7 @@ write_cmake_config("config") { "HAVE_SIGALTSTACK=1", "HAVE_STRERROR_R=1", "HAVE_SYSCONF=1", + "HAVE_SYS_IOCTL_H=1", "HAVE_SYS_MMAN_H=1", "HAVE_UNISTD_H=1", "HAVE__CHSIZE_S=", @@ -298,8 +300,8 @@ write_cmake_config("llvm-config") { "LLVM_BUILD_SHARED_LIBS=", "LLVM_ENABLE_TELEMETRY=", "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple", - "LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING=", - "LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING=", + "LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE=", + "LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN=", "LLVM_ENABLE_DUMP=", "LLVM_ENABLE_HTTPLIB=", "LLVM_FORCE_USE_OLD_TOOLCHAIN=", diff --git a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn index fc071e5471d01..cb46f7cf55fe0 100644 --- a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn @@ -11,6 +11,7 @@ static_library("DWARF") { "DWARFAbbreviationDeclaration.cpp", "DWARFAcceleratorTable.cpp", "DWARFAddressRange.cpp", + "DWARFCFIPrinter.cpp", "DWARFCFIProgram.cpp", "DWARFCompileUnit.cpp", "DWARFContext.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn index a10a0d5637e95..87c02122e0f63 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn +++ 
b/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn @@ -67,7 +67,6 @@ static_library("MCTargetDesc") { "MipsInstPrinter.cpp", "MipsMCAsmInfo.cpp", "MipsMCCodeEmitter.cpp", - "MipsMCExpr.cpp", "MipsMCTargetDesc.cpp", "MipsNaClELFStreamer.cpp", "MipsOptionRecord.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn index cd7d0671fbe71..7338fb159419a 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn @@ -14,6 +14,7 @@ unittest("LLVMFrontendTests") { ] sources = [ "HLSLRootSignatureDumpTest.cpp", + "HLSLRootSignatureRangesTest.cpp", "OpenACCTest.cpp", "OpenMPCompositionTest.cpp", "OpenMPContextTest.cpp", diff --git a/mlir/docs/Dialects/Vector.md b/mlir/docs/Dialects/Vector.md index ade0068c56fb6..ebeb0a2de0ff1 100644 --- a/mlir/docs/Dialects/Vector.md +++ b/mlir/docs/Dialects/Vector.md @@ -1,5 +1,8 @@ # 'vector' Dialect +**Please post an RFC on the [forum](https://llvm.discourse.group/c/mlir/31) +before adding any operation in this dialect.** + [TOC] MLIR supports multi-dimensional `vector` types and custom operations on those diff --git a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h index bf830a29613fd..5c538d28c1835 100644 --- a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h @@ -32,6 +32,20 @@ enum class BoundType; namespace affine { class AffineApplyOp; +class AffineDelinearizeIndexOp; +class AffineLinearizeIndexOp; + +/// Lowers `affine.delinearize_index` into a sequence of division and remainder +/// operations. +LogicalResult lowerAffineDelinearizeIndexOp(RewriterBase &rewriter, + AffineDelinearizeIndexOp op); + +/// Lowers `affine.linearize_index` into a sequence of multiplications and +/// additions. 
Make a best effort to sort the input indices so that +/// the most loop-invariant terms are at the left of the additions +/// to enable loop-invariant code motion. +LogicalResult lowerAffineLinearizeIndexOp(RewriterBase &rewriter, + AffineLinearizeIndexOp op); /// Populate patterns that expand affine index operations into more fundamental /// operations (not necessarily restricted to Affine dialect). diff --git a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td index 48f12b46a57f1..79da81ba049dd 100644 --- a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td +++ b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td @@ -112,10 +112,11 @@ def BranchOp : CF_Op<"br", [ // CondBranchOp //===----------------------------------------------------------------------===// -def CondBranchOp : CF_Op<"cond_br", - [AttrSizedOperandSegments, - DeclareOpInterfaceMethods, - Pure, Terminator]> { +def CondBranchOp + : CF_Op<"cond_br", [AttrSizedOperandSegments, + DeclareOpInterfaceMethods< + BranchOpInterface, ["getSuccessorForOperands"]>, + WeightedBranchOpInterface, Pure, Terminator]> { let summary = "Conditional branch operation"; let description = [{ The `cf.cond_br` terminator operation represents a conditional branch on a @@ -144,20 +145,23 @@ def CondBranchOp : CF_Op<"cond_br", ``` }]; - let arguments = (ins I1:$condition, - Variadic:$trueDestOperands, - Variadic:$falseDestOperands); + let arguments = (ins I1:$condition, Variadic:$trueDestOperands, + Variadic:$falseDestOperands, + OptionalAttr:$branch_weights); let successors = (successor AnySuccessor:$trueDest, AnySuccessor:$falseDest); - let builders = [ - OpBuilder<(ins "Value":$condition, "Block *":$trueDest, - "ValueRange":$trueOperands, "Block *":$falseDest, - "ValueRange":$falseOperands), [{ - build($_builder, $_state, condition, trueOperands, falseOperands, trueDest, + let builders = [OpBuilder<(ins "Value":$condition, "Block 
*":$trueDest, + "ValueRange":$trueOperands, + "Block *":$falseDest, + "ValueRange":$falseOperands), + [{ + build($_builder, $_state, condition, trueOperands, falseOperands, /*branch_weights=*/{}, trueDest, falseDest); }]>, - OpBuilder<(ins "Value":$condition, "Block *":$trueDest, - "Block *":$falseDest, CArg<"ValueRange", "{}">:$falseOperands), [{ + OpBuilder<(ins "Value":$condition, "Block *":$trueDest, + "Block *":$falseDest, + CArg<"ValueRange", "{}">:$falseOperands), + [{ build($_builder, $_state, condition, trueDest, ValueRange(), falseDest, falseOperands); }]>]; @@ -216,7 +220,7 @@ def CondBranchOp : CF_Op<"cond_br", let hasCanonicalizer = 1; let assemblyFormat = [{ - $condition `,` + $condition (`weights` `(` $branch_weights^ `)` )? `,` $trueDest (`(` $trueDestOperands^ `:` type($trueDestOperands) `)`)? `,` $falseDest (`(` $falseDestOperands^ `:` type($falseDestOperands) `)`)? attr-dict diff --git a/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt index 610170f5944eb..299cee76cb1b4 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/EmitC/IR/CMakeLists.txt @@ -1,6 +1,12 @@ add_mlir_dialect(EmitC emitc) add_mlir_doc(EmitC EmitC Dialects/ -gen-dialect-doc -dialect emitc) +set(LLVM_TARGET_DEFINITIONS EmitCInterfaces.td) +mlir_tablegen(EmitCInterfaces.h.inc -gen-op-interface-decls) +mlir_tablegen(EmitCInterfaces.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(MLIREmitCInterfacesIncGen) +add_dependencies(mlir-generic-headers MLIREmitCInterfacesIncGen) + set(LLVM_TARGET_DEFINITIONS EmitCAttributes.td) mlir_tablegen(EmitCEnums.h.inc -gen-enum-decls) mlir_tablegen(EmitCEnums.cpp.inc -gen-enum-defs) diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h index 57029c64ffd00..1984ed8a7f068 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.h @@ -14,7 +14,7 @@ #define 
MLIR_DIALECT_EMITC_IR_EMITC_H #include "mlir/Bytecode/BytecodeOpInterface.h" -#include "mlir/Dialect/EmitC/IR/EmitCTraits.h" +#include "mlir/Dialect/EmitC/IR/EmitCInterfaces.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td index d4aea52a0d485..9ecdb74f4d82e 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td @@ -14,6 +14,7 @@ #define MLIR_DIALECT_EMITC_IR_EMITC include "mlir/Dialect/EmitC/IR/EmitCAttributes.td" +include "mlir/Dialect/EmitC/IR/EmitCInterfaces.td" include "mlir/Dialect/EmitC/IR/EmitCTypes.td" include "mlir/Interfaces/CallInterfaces.td" @@ -35,22 +36,31 @@ class EmitC_Op traits = []> // Base class for unary operations. class EmitC_UnaryOp traits = []> : - EmitC_Op { + EmitC_Op { let arguments = (ins EmitCType); let results = (outs EmitCType); let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)"; + + let extraClassDeclaration = [{ + bool hasSideEffects() { + return false; + } + }]; } // Base class for binary operations. class EmitC_BinaryOp traits = []> : - EmitC_Op { + EmitC_Op { let arguments = (ins EmitCType:$lhs, EmitCType:$rhs); let results = (outs EmitCType); let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)"; -} -// EmitC OpTrait -def CExpression : NativeOpTrait<"emitc::CExpression">; + let extraClassDeclaration = [{ + bool hasSideEffects() { + return false; + } + }]; +} // Types only used in binary arithmetic operations. 
def IntegerIndexOrOpaqueType : Type, @@ -103,7 +113,7 @@ def EmitC_FileOp let skipDefaultBuilders = 1; } -def EmitC_AddOp : EmitC_BinaryOp<"add", [CExpression]> { +def EmitC_AddOp : EmitC_BinaryOp<"add", []> { let summary = "Addition operation"; let description = [{ With the `emitc.add` operation the arithmetic operator + (addition) can @@ -126,7 +136,7 @@ def EmitC_AddOp : EmitC_BinaryOp<"add", [CExpression]> { let hasVerifier = 1; } -def EmitC_ApplyOp : EmitC_Op<"apply", [CExpression]> { +def EmitC_ApplyOp : EmitC_Op<"apply", [CExpressionInterface]> { let summary = "Apply operation"; let description = [{ With the `emitc.apply` operation the operators & (address of) and * (contents of) @@ -152,10 +162,17 @@ def EmitC_ApplyOp : EmitC_Op<"apply", [CExpression]> { let assemblyFormat = [{ $applicableOperator `(` $operand `)` attr-dict `:` functional-type($operand, results) }]; + + let extraClassDeclaration = [{ + bool hasSideEffects() { + return getApplicableOperator() == "*"; + } + }]; + let hasVerifier = 1; } -def EmitC_BitwiseAndOp : EmitC_BinaryOp<"bitwise_and", [CExpression]> { +def EmitC_BitwiseAndOp : EmitC_BinaryOp<"bitwise_and", []> { let summary = "Bitwise and operation"; let description = [{ With the `emitc.bitwise_and` operation the bitwise operator & (and) can @@ -173,8 +190,7 @@ def EmitC_BitwiseAndOp : EmitC_BinaryOp<"bitwise_and", [CExpression]> { }]; } -def EmitC_BitwiseLeftShiftOp : EmitC_BinaryOp<"bitwise_left_shift", - [CExpression]> { +def EmitC_BitwiseLeftShiftOp : EmitC_BinaryOp<"bitwise_left_shift", []> { let summary = "Bitwise left shift operation"; let description = [{ With the `emitc.bitwise_left_shift` operation the bitwise operator << @@ -192,7 +208,7 @@ def EmitC_BitwiseLeftShiftOp : EmitC_BinaryOp<"bitwise_left_shift", }]; } -def EmitC_BitwiseNotOp : EmitC_UnaryOp<"bitwise_not", [CExpression]> { +def EmitC_BitwiseNotOp : EmitC_UnaryOp<"bitwise_not", []> { let summary = "Bitwise not operation"; let description = [{ With the 
`emitc.bitwise_not` operation the bitwise operator ~ (not) can @@ -210,7 +226,7 @@ def EmitC_BitwiseNotOp : EmitC_UnaryOp<"bitwise_not", [CExpression]> { }]; } -def EmitC_BitwiseOrOp : EmitC_BinaryOp<"bitwise_or", [CExpression]> { +def EmitC_BitwiseOrOp : EmitC_BinaryOp<"bitwise_or", []> { let summary = "Bitwise or operation"; let description = [{ With the `emitc.bitwise_or` operation the bitwise operator | (or) @@ -228,8 +244,7 @@ def EmitC_BitwiseOrOp : EmitC_BinaryOp<"bitwise_or", [CExpression]> { }]; } -def EmitC_BitwiseRightShiftOp : EmitC_BinaryOp<"bitwise_right_shift", - [CExpression]> { +def EmitC_BitwiseRightShiftOp : EmitC_BinaryOp<"bitwise_right_shift", []> { let summary = "Bitwise right shift operation"; let description = [{ With the `emitc.bitwise_right_shift` operation the bitwise operator >> @@ -247,7 +262,7 @@ def EmitC_BitwiseRightShiftOp : EmitC_BinaryOp<"bitwise_right_shift", }]; } -def EmitC_BitwiseXorOp : EmitC_BinaryOp<"bitwise_xor", [CExpression]> { +def EmitC_BitwiseXorOp : EmitC_BinaryOp<"bitwise_xor", []> { let summary = "Bitwise xor operation"; let description = [{ With the `emitc.bitwise_xor` operation the bitwise operator ^ (xor) @@ -265,7 +280,7 @@ def EmitC_BitwiseXorOp : EmitC_BinaryOp<"bitwise_xor", [CExpression]> { }]; } -def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", [CExpression]> { +def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", [CExpressionInterface]> { let summary = "Opaque call operation"; let description = [{ The `emitc.call_opaque` operation represents a C++ function call. 
The callee @@ -312,7 +327,7 @@ def EmitC_CallOpaqueOp : EmitC_Op<"call_opaque", [CExpression]> { } def EmitC_CastOp : EmitC_Op<"cast", - [CExpression, + [CExpressionInterface, DeclareOpInterfaceMethods]> { let summary = "Cast operation"; let description = [{ @@ -335,9 +350,15 @@ def EmitC_CastOp : EmitC_Op<"cast", let arguments = (ins EmitCType:$source); let results = (outs EmitCType:$dest); let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)"; + + let extraClassDeclaration = [{ + bool hasSideEffects() { + return false; + } + }]; } -def EmitC_CmpOp : EmitC_BinaryOp<"cmp", [CExpression]> { +def EmitC_CmpOp : EmitC_BinaryOp<"cmp", []> { let summary = "Comparison operation"; let description = [{ With the `emitc.cmp` operation the comparison operators ==, !=, <, <=, >, >=, <=> @@ -407,7 +428,7 @@ def EmitC_ConstantOp : EmitC_Op<"constant", [ConstantLike]> { let hasVerifier = 1; } -def EmitC_DivOp : EmitC_BinaryOp<"div", [CExpression]> { +def EmitC_DivOp : EmitC_BinaryOp<"div", []> { let summary = "Division operation"; let description = [{ With the `emitc.div` operation the arithmetic operator / (division) can @@ -462,7 +483,7 @@ def EmitC_ExpressionOp : EmitC_Op<"expression", ``` The operations allowed within expression body are EmitC operations with the - CExpression trait. + CExpressionInterface interface. When specified, the optional `do_not_inline` indicates that the expression is to be emitted as seen above, i.e. as the rhs of an EmitC SSA value @@ -480,18 +501,8 @@ def EmitC_ExpressionOp : EmitC_Op<"expression", let extraClassDeclaration = [{ bool hasSideEffects() { auto predicate = [](Operation &op) { - assert(op.hasTrait() && "Expected a C expression"); - // Conservatively assume calls to read and write memory. - if (isa(op)) - return true; - // De-referencing reads modifiable memory, address-taking has no - // side-effect. 
- auto applyOp = dyn_cast(op); - if (applyOp) - return applyOp.getApplicableOperator() == "*"; - // Any load operation is assumed to read from memory and thus perform - // a side effect. - return isa(op); + assert(isa(op) && "Expected a C expression"); + return cast(op).hasSideEffects(); }; return llvm::any_of(getRegion().front().without_terminator(), predicate); }; @@ -579,7 +590,7 @@ def EmitC_ForOp : EmitC_Op<"for", } def EmitC_CallOp : EmitC_Op<"call", - [CallOpInterface, CExpression, + [CallOpInterface, CExpressionInterface, DeclareOpInterfaceMethods]> { let summary = "Call operation"; let description = [{ @@ -649,6 +660,10 @@ def EmitC_CallOp : EmitC_Op<"call", void setCalleeFromCallable(CallInterfaceCallable callee) { (*this)->setAttr("callee", cast(callee)); } + + bool hasSideEffects() { + return false; + } }]; let assemblyFormat = [{ @@ -861,7 +876,7 @@ def EmitC_LiteralOp : EmitC_Op<"literal", [Pure]> { let assemblyFormat = "$value attr-dict `:` type($result)"; } -def EmitC_LogicalAndOp : EmitC_BinaryOp<"logical_and", [CExpression]> { +def EmitC_LogicalAndOp : EmitC_BinaryOp<"logical_and", []> { let summary = "Logical and operation"; let description = [{ With the `emitc.logical_and` operation the logical operator && (and) can @@ -882,7 +897,7 @@ def EmitC_LogicalAndOp : EmitC_BinaryOp<"logical_and", [CExpression]> { let assemblyFormat = "operands attr-dict `:` type(operands)"; } -def EmitC_LogicalNotOp : EmitC_UnaryOp<"logical_not", [CExpression]> { +def EmitC_LogicalNotOp : EmitC_UnaryOp<"logical_not", []> { let summary = "Logical not operation"; let description = [{ With the `emitc.logical_not` operation the logical operator ! 
(negation) can @@ -903,7 +918,7 @@ def EmitC_LogicalNotOp : EmitC_UnaryOp<"logical_not", [CExpression]> { let assemblyFormat = "operands attr-dict `:` type(operands)"; } -def EmitC_LogicalOrOp : EmitC_BinaryOp<"logical_or", [CExpression]> { +def EmitC_LogicalOrOp : EmitC_BinaryOp<"logical_or", []> { let summary = "Logical or operation"; let description = [{ With the `emitc.logical_or` operation the logical operator || (inclusive or) @@ -924,7 +939,7 @@ def EmitC_LogicalOrOp : EmitC_BinaryOp<"logical_or", [CExpression]> { let assemblyFormat = "operands attr-dict `:` type(operands)"; } -def EmitC_LoadOp : EmitC_Op<"load", [CExpression, +def EmitC_LoadOp : EmitC_Op<"load", [CExpressionInterface, TypesMatchWith<"result type matches value type of 'operand'", "operand", "result", "::llvm::cast($_self).getValueType()"> @@ -953,7 +968,7 @@ def EmitC_LoadOp : EmitC_Op<"load", [CExpression, let assemblyFormat = "$operand attr-dict `:` type($operand)"; } -def EmitC_MulOp : EmitC_BinaryOp<"mul", [CExpression]> { +def EmitC_MulOp : EmitC_BinaryOp<"mul", []> { let summary = "Multiplication operation"; let description = [{ With the `emitc.mul` operation the arithmetic operator * (multiplication) can @@ -977,7 +992,7 @@ def EmitC_MulOp : EmitC_BinaryOp<"mul", [CExpression]> { let results = (outs FloatIntegerIndexOrOpaqueType); } -def EmitC_RemOp : EmitC_BinaryOp<"rem", [CExpression]> { +def EmitC_RemOp : EmitC_BinaryOp<"rem", []> { let summary = "Remainder operation"; let description = [{ With the `emitc.rem` operation the arithmetic operator % (remainder) can @@ -999,7 +1014,7 @@ def EmitC_RemOp : EmitC_BinaryOp<"rem", [CExpression]> { let results = (outs IntegerIndexOrOpaqueType); } -def EmitC_SubOp : EmitC_BinaryOp<"sub", [CExpression]> { +def EmitC_SubOp : EmitC_BinaryOp<"sub", []> { let summary = "Subtraction operation"; let description = [{ With the `emitc.sub` operation the arithmetic operator - (subtraction) can @@ -1069,7 +1084,7 @@ def EmitC_MemberOfPtrOp : 
EmitC_Op<"member_of_ptr"> { } def EmitC_ConditionalOp : EmitC_Op<"conditional", - [AllTypesMatch<["true_value", "false_value", "result"]>, CExpression]> { + [AllTypesMatch<["true_value", "false_value", "result"]>, CExpressionInterface]> { let summary = "Conditional (ternary) operation"; let description = [{ With the `emitc.conditional` operation the ternary conditional operator can @@ -1096,9 +1111,15 @@ def EmitC_ConditionalOp : EmitC_Op<"conditional", let arguments = (ins I1:$condition, EmitCType:$true_value, EmitCType:$false_value); let results = (outs EmitCType:$result); let assemblyFormat = "operands attr-dict `:` type($result)"; + + let extraClassDeclaration = [{ + bool hasSideEffects() { + return false; + } + }]; } -def EmitC_UnaryMinusOp : EmitC_UnaryOp<"unary_minus", [CExpression]> { +def EmitC_UnaryMinusOp : EmitC_UnaryOp<"unary_minus", []> { let summary = "Unary minus operation"; let description = [{ With the `emitc.unary_minus` operation the unary operator - (minus) can be @@ -1116,7 +1137,7 @@ def EmitC_UnaryMinusOp : EmitC_UnaryOp<"unary_minus", [CExpression]> { }]; } -def EmitC_UnaryPlusOp : EmitC_UnaryOp<"unary_plus", [CExpression]> { +def EmitC_UnaryPlusOp : EmitC_UnaryOp<"unary_plus", []> { let summary = "Unary plus operation"; let description = [{ With the `emitc.unary_plus` operation the unary operator + (plus) can be @@ -1304,7 +1325,7 @@ def EmitC_VerbatimOp : EmitC_Op<"verbatim"> { FailureOr> parseFormatString(); }]; - let arguments = (ins StrAttr:$value, Variadic:$fmtArgs); + let arguments = (ins StrAttr:$value, Variadic>:$fmtArgs); let builders = [OpBuilder<(ins "::mlir::StringAttr":$value), [{ build($_builder, $_state, value, {}); }]>]; diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.h b/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.h new file mode 100644 index 0000000000000..51efe76aceb5c --- /dev/null +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.h @@ -0,0 +1,31 @@ +//===- EmitCInterfaces.h - EmitC 
interfaces definitions ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares C++ classes for some of the interfaces used in the EmitC +// dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_EMITC_IR_EMITCINTERFACES_H +#define MLIR_DIALECT_EMITC_IR_EMITCINTERFACES_H + +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace emitc { +// +} // namespace emitc +} // namespace mlir + +//===----------------------------------------------------------------------===// +// EmitC Dialect Interfaces +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/EmitC/IR/EmitCInterfaces.h.inc" + +#endif // MLIR_DIALECT_EMITC_IR_EMITCINTERFACES_H diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td new file mode 100644 index 0000000000000..777784e56202a --- /dev/null +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitCInterfaces.td @@ -0,0 +1,48 @@ +//===- EmitCInterfaces.td - EmitC Interfaces ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the interfaces used by EmitC. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_EMITC_IR_EMITCINTERFACES +#define MLIR_DIALECT_EMITC_IR_EMITCINTERFACES + +include "mlir/IR/OpBase.td" + +def CExpressionInterface : OpInterface<"CExpressionInterface"> { + let description = [{ + Interface to mark operations that can be part of the CExpression. + }]; + + let cppNamespace = "::mlir::emitc"; + let methods = [ + InterfaceMethod<[{ + Check whether operation has side effects that may affect the expression + evaluation. + + By default operation is marked as having side effects. + + ```c++ + class ConcreteOp ... { + public: + bool hasSideEffects() { + // That way we can override the default implementation. + return false; + } + }; + ``` + }], + "bool", "hasSideEffects", (ins), /*methodBody=*/[{}], + /*defaultImplementation=*/[{ + return true; + }]>, + ]; +} + +#endif // MLIR_DIALECT_EMITC_IR_EMITCINTERFACES diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitCTraits.h b/mlir/include/mlir/Dialect/EmitC/IR/EmitCTraits.h deleted file mode 100644 index c1602dfce4b48..0000000000000 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitCTraits.h +++ /dev/null @@ -1,30 +0,0 @@ -//===- EmitCTraits.h - EmitC trait definitions ------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file declares C++ classes for some of the traits used in the EmitC -// dialect. 
-// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_DIALECT_EMITC_IR_EMITCTRAITS_H -#define MLIR_DIALECT_EMITC_IR_EMITCTRAITS_H - -#include "mlir/IR/OpDefinition.h" - -namespace mlir { -namespace OpTrait { -namespace emitc { - -template -class CExpression : public TraitBase {}; - -} // namespace emitc -} // namespace OpTrait -} // namespace mlir - -#endif // MLIR_DIALECT_EMITC_IR_EMITCTRAITS_H diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td index 2824f09dab6ce..138170f8c8762 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td @@ -168,42 +168,6 @@ def NonNegFlagInterface : OpInterface<"NonNegFlagInterface"> { ]; } -def BranchWeightOpInterface : OpInterface<"BranchWeightOpInterface"> { - let description = [{ - An interface for operations that can carry branch weights metadata. It - provides setters and getters for the operation's branch weights attribute. - The default implementation of the interface methods expect the operation to - have an attribute of type DenseI32ArrayAttr named branch_weights. 
- }]; - - let cppNamespace = "::mlir::LLVM"; - - let methods = [ - InterfaceMethod< - /*desc=*/ "Returns the branch weights attribute or nullptr", - /*returnType=*/ "::mlir::DenseI32ArrayAttr", - /*methodName=*/ "getBranchWeightsOrNull", - /*args=*/ (ins), - /*methodBody=*/ [{}], - /*defaultImpl=*/ [{ - auto op = cast(this->getOperation()); - return op.getBranchWeightsAttr(); - }] - >, - InterfaceMethod< - /*desc=*/ "Sets the branch weights attribute", - /*returnType=*/ "void", - /*methodName=*/ "setBranchWeights", - /*args=*/ (ins "::mlir::DenseI32ArrayAttr":$attr), - /*methodBody=*/ [{}], - /*defaultImpl=*/ [{ - auto op = cast(this->getOperation()); - op.setBranchWeightsAttr(attr); - }] - > - ]; -} - def AccessGroupOpInterface : OpInterface<"AccessGroupOpInterface"> { let description = [{ An interface for memory operations that can carry access groups metadata. diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index eda1d544cd81c..939e7a09a73ad 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -660,12 +660,12 @@ def LLVM_FPTruncOp : LLVM_CastOp<"fptrunc", "FPTrunc", LLVM_ScalarOrVectorOf>; // Call-related operations. 
-def LLVM_InvokeOp : LLVM_Op<"invoke", [ - AttrSizedOperandSegments, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - Terminator]> { +def LLVM_InvokeOp + : LLVM_Op<"invoke", [AttrSizedOperandSegments, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + Terminator]> { let arguments = (ins OptionalAttr>:$var_callee_type, OptionalAttr:$callee, @@ -734,12 +734,12 @@ def LLVM_VaArgOp : LLVM_Op<"va_arg"> { // CallOp //===----------------------------------------------------------------------===// -def LLVM_CallOp : LLVM_MemAccessOpBase<"call", - [AttrSizedOperandSegments, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods]> { +def LLVM_CallOp + : LLVM_MemAccessOpBase< + "call", [AttrSizedOperandSegments, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods]> { let summary = "Call to an LLVM function."; let description = [{ In LLVM IR, functions may return either 0 or 1 value. LLVM IR dialect @@ -788,21 +788,16 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call", OptionalAttr:$callee, Variadic:$callee_operands, DefaultValuedAttr:$fastmathFlags, - OptionalAttr:$branch_weights, DefaultValuedAttr:$CConv, DefaultValuedAttr:$TailCallKind, OptionalAttr:$memory_effects, - UnitAttr:$convergent, - UnitAttr:$no_unwind, - UnitAttr:$will_return, + UnitAttr:$convergent, UnitAttr:$no_unwind, UnitAttr:$will_return, VariadicOfVariadic:$op_bundle_operands, DenseI32ArrayAttr:$op_bundle_sizes, OptionalAttr:$op_bundle_tags, OptionalAttr:$arg_attrs, - OptionalAttr:$res_attrs, - UnitAttr:$no_inline, - UnitAttr:$always_inline, - UnitAttr:$inline_hint); + OptionalAttr:$res_attrs, UnitAttr:$no_inline, + UnitAttr:$always_inline, UnitAttr:$inline_hint); // Append the aliasing related attributes defined in LLVM_MemAccessOpBase. 
let arguments = !con(args, aliasAttrs); let results = (outs Optional:$result); @@ -1047,11 +1042,12 @@ def LLVM_BrOp : LLVM_TerminatorOp<"br", LLVM_TerminatorPassthroughOpBuilder ]; } -def LLVM_CondBrOp : LLVM_TerminatorOp<"cond_br", - [AttrSizedOperandSegments, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - Pure]> { +def LLVM_CondBrOp + : LLVM_TerminatorOp< + "cond_br", [AttrSizedOperandSegments, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + Pure]> { let arguments = (ins I1:$condition, Variadic:$trueDestOperands, Variadic:$falseDestOperands, @@ -1136,11 +1132,12 @@ def LLVM_UnreachableOp : LLVM_TerminatorOp<"unreachable"> { }]; } -def LLVM_SwitchOp : LLVM_TerminatorOp<"switch", - [AttrSizedOperandSegments, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - Pure]> { +def LLVM_SwitchOp + : LLVM_TerminatorOp< + "switch", [AttrSizedOperandSegments, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + Pure]> { let arguments = (ins AnySignlessInteger:$value, Variadic:$defaultOperands, @@ -1829,7 +1826,7 @@ def LLVM_ComdatOp : LLVM_Op<"comdat", [NoTerminator, NoRegionArguments, SymbolTa } def LLVM_LLVMFuncOp : LLVM_Op<"func", [ - AutomaticAllocationScope, IsolatedFromAbove, FunctionOpInterface + AffineScope, AutomaticAllocationScope, IsolatedFromAbove, FunctionOpInterface ]> { let summary = "LLVM dialect function."; diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td index 2ec61758ba8ef..8c4da9b2dce18 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td @@ -1160,6 +1160,46 @@ def SPIRV_GLFMixOp : // ----- +def SPIRV_GLLengthOp : SPIRV_GLOp<"Length", 66, [ + Pure, + TypesMatchWith<"result type must match operand element type", + "operand", "result", + "::mlir::getElementTypeOrSelf($_self)"> + ]> { + let summary = "Return the length of a vector x"; + + let description = [{ + Result is the length of vector x, 
i.e., sqrt(x[0]**2 + x[1]**2 + ...). + + The operand x must be a scalar or vector whose component type is floating-point. + + Result Type must be a scalar of the same type as the component type of x. + + #### Example: + + ```mlir + %2 = spirv.GL.Length %0 : vector<3xf32> -> f32 + %3 = spirv.GL.Length %1 : f32 -> f32 + ``` + }]; + + let arguments = (ins + SPIRV_ScalarOrVectorOf:$operand + ); + + let results = (outs + SPIRV_Float:$result + ); + + let assemblyFormat = [{ + $operand attr-dict `:` type($operand) `->` type($result) + }]; + + let hasVerifier = 0; +} + +// ----- + def SPIRV_GLDistanceOp : SPIRV_GLOp<"Distance", 67, [ Pure, AllTypesMatch<["p0", "p1"]>, diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h index 1a4733df3f187..a1ce4e252c2f4 100644 --- a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h @@ -30,7 +30,7 @@ namespace tensor { // for _static_ dimensions. PadOp createPadHighOp(RankedTensorType resType, Value source, Value pad, bool nofold, Location loc, OpBuilder &builder, - SmallVector dynOutDims = {}); + ValueRange dynOutDims = std::nullopt); // Creates dim ops for each dynamic dimension of the ranked tensor argument and // returns these as values. 
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index 0aa750e625436..62e66b3dabee8 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -418,11 +418,14 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass", with options = { "top-down" = false, "max-iterations" = %max_iter, "test-convergence" = true, - "max-num-rewrites" = %max_rewrites } + "max-num-rewrites" = %max_rewrites } to %module : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op ``` + Options' values which are `ArrayAttr`s are converted to comma-separated + lists of options. Likewise for params which associate multiple values. + This op first looks for a pass pipeline with the specified name. If no such pipeline exists, it looks for a pass with the specified name. If no such pass exists either, this op fails definitely. diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h index 7f6967f11444f..d63800c12d132 100644 --- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h +++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h @@ -142,6 +142,26 @@ LogicalResult verifyBranchSuccessorOperands(Operation *op, unsigned succNo, const SuccessorOperands &operands); } // namespace detail +//===----------------------------------------------------------------------===// +// WeightedBranchOpInterface +//===----------------------------------------------------------------------===// + +namespace detail { +/// Verify that the branch weights attached to an operation +/// implementing WeightedBranchOpInterface are correct. 
+LogicalResult verifyBranchWeights(Operation *op); +} // namespace detail + +//===----------------------------------------------------------------------===// +// WeightedRegionBranchOpInterface +//===----------------------------------------------------------------------===// + +namespace detail { +/// Verify that the region weights attached to an operation +/// implementing WeightedRegionBranchOpInterface are correct. +LogicalResult verifyRegionBranchWeights(Operation *op); +} // namespace detail + +//===----------------------------------------------------------------------===// // RegionBranchOpInterface //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td index 69bce78e946c8..46ab0b9ebbc6b 100644 --- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td +++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td @@ -375,6 +375,118 @@ def SelectLikeOpInterface : OpInterface<"SelectLikeOpInterface"> { ]; } +//===----------------------------------------------------------------------===// +// WeightedBranchOpInterface +//===----------------------------------------------------------------------===// + +def WeightedBranchOpInterface : OpInterface<"WeightedBranchOpInterface"> { + let description = [{ + This interface provides weight information for branching terminator + operations, i.e. terminator operations with successors. + + This interface provides methods for getting/setting integer non-negative + weight of each branch. The probability of executing a branch + is computed as the ratio between the branch's weight and the total + sum of the weights (which cannot be zero). + The weights are optional. If they are provided, then their number + must match the number of successors of the operation.
+ + The default implementations of the methods expect the operation + to have an attribute of type DenseI32ArrayAttr named branch_weights. + }]; + let cppNamespace = "::mlir"; + + let methods = [InterfaceMethod< + /*desc=*/"Returns the branch weights", + /*returnType=*/"::llvm::ArrayRef", + /*methodName=*/"getWeights", + /*args=*/(ins), + /*methodBody=*/[{}], + /*defaultImpl=*/[{ + auto op = cast(this->getOperation()); + if (auto attr = op.getBranchWeightsAttr()) + return attr.asArrayRef(); + return {}; + }]>, + InterfaceMethod< + /*desc=*/"Sets the branch weights", + /*returnType=*/"void", + /*methodName=*/"setWeights", + /*args=*/(ins "::llvm::ArrayRef":$weights), + /*methodBody=*/[{}], + /*defaultImpl=*/[{ + auto op = cast(this->getOperation()); + op.setBranchWeightsAttr(::mlir::DenseI32ArrayAttr::get(op->getContext(), weights)); + }]>, + ]; + + let verify = [{ + return ::mlir::detail::verifyBranchWeights($_op); + }]; +} + +//===----------------------------------------------------------------------===// +// WeightedRegionBranchOpInterface +//===----------------------------------------------------------------------===// + +// TODO: the probabilities of entering a particular region seem +// to correlate with the values returned by +// RegionBranchOpInterface::invocationBounds(), and we should probably +// verify that the values are consistent. In that case, should +// WeightedRegionBranchOpInterface extend RegionBranchOpInterface? +def WeightedRegionBranchOpInterface + : OpInterface<"WeightedRegionBranchOpInterface"> { + let description = [{ + This interface provides weight information for region operations + that exhibit branching behavior between held regions. + + This interface provides methods for getting/setting integer non-negative + weight of each branch. The probability of executing a region is computed + as the ratio between the region branch's weight and the total sum + of the weights (which cannot be zero). + The weights are optional. 
If they are provided, then their number + must match the number of regions held by the operation + (including empty regions). + + The weights specify the probability of branching to a particular + region when first executing the operation. + For example, for loop-like operations with a single region + the weight specifies the probability of entering the loop. + + The default implementations of the methods expect the operation + to have an attribute of type DenseI32ArrayAttr named branch_weights. + }]; + let cppNamespace = "::mlir"; + + let methods = [InterfaceMethod< + /*desc=*/"Returns the region weights", + /*returnType=*/"::llvm::ArrayRef", + /*methodName=*/"getWeights", + /*args=*/(ins), + /*methodBody=*/[{}], + /*defaultImpl=*/[{ + auto op = cast(this->getOperation()); + if (auto attr = op.getRegionWeightsAttr()) + return attr.asArrayRef(); + return {}; + }]>, + InterfaceMethod< + /*desc=*/"Sets the region weights", + /*returnType=*/"void", + /*methodName=*/"setWeights", + /*args=*/(ins "::llvm::ArrayRef":$weights), + /*methodBody=*/[{}], + /*defaultImpl=*/[{ + auto op = cast(this->getOperation()); + op.setRegionWeightsAttr(::mlir::DenseI32ArrayAttr::get(op->getContext(), weights)); + }]>, + ]; + + let verify = [{ + return ::mlir::detail::verifyRegionBranchWeights($_op); + }]; +} + //===----------------------------------------------------------------------===// // ControlFlow Traits //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Query/Matcher/Marshallers.h b/mlir/include/mlir/Query/Matcher/Marshallers.h index 012bf7b9ec4a9..5fe6965f32efb 100644 --- a/mlir/include/mlir/Query/Matcher/Marshallers.h +++ b/mlir/include/mlir/Query/Matcher/Marshallers.h @@ -108,6 +108,9 @@ class MatcherDescriptor { const llvm::ArrayRef args, Diagnostics *error) const = 0; + // If the matcher is variadic, it can take any number of arguments. 
+ virtual bool isVariadic() const = 0; + // Returns the number of arguments accepted by the matcher. virtual unsigned getNumArgs() const = 0; @@ -140,6 +143,8 @@ class FixedArgCountMatcherDescriptor : public MatcherDescriptor { return marshaller(matcherFunc, matcherName, nameRange, args, error); } + bool isVariadic() const override { return false; } + unsigned getNumArgs() const override { return argKinds.size(); } void getArgKinds(unsigned argNo, std::vector &kinds) const override { @@ -153,6 +158,54 @@ class FixedArgCountMatcherDescriptor : public MatcherDescriptor { const std::vector argKinds; }; +class VariadicOperatorMatcherDescriptor : public MatcherDescriptor { +public: + using VarOp = DynMatcher::VariadicOperator; + VariadicOperatorMatcherDescriptor(unsigned minCount, unsigned maxCount, + VarOp varOp, StringRef matcherName) + : minCount(minCount), maxCount(maxCount), varOp(varOp), + matcherName(matcherName) {} + + VariantMatcher create(SourceRange nameRange, ArrayRef args, + Diagnostics *error) const override { + if (args.size() < minCount || maxCount < args.size()) { + addError(error, nameRange, ErrorType::RegistryWrongArgCount, + {llvm::Twine("requires between "), llvm::Twine(minCount), + llvm::Twine(" and "), llvm::Twine(maxCount), + llvm::Twine(" args, got "), llvm::Twine(args.size())}); + return VariantMatcher(); + } + + std::vector innerArgs; + for (int64_t i = 0, e = args.size(); i != e; ++i) { + const ParserValue &arg = args[i]; + const VariantValue &value = arg.value; + if (!value.isMatcher()) { + addError(error, arg.range, ErrorType::RegistryWrongArgType, + {llvm::Twine(i + 1), llvm::Twine("matcher: "), + llvm::Twine(value.getTypeAsString())}); + return VariantMatcher(); + } + innerArgs.push_back(value.getMatcher()); + } + return VariantMatcher::VariadicOperatorMatcher(varOp, std::move(innerArgs)); + } + + bool isVariadic() const override { return true; } + + unsigned getNumArgs() const override { return 0; } + + void getArgKinds(unsigned argNo, 
std::vector &kinds) const override { + kinds.push_back(ArgKind(ArgKind::Matcher)); + } + +private: + const unsigned minCount; + const unsigned maxCount; + const VarOp varOp; + const StringRef matcherName; +}; + // Helper function to check if argument count matches expected count inline bool checkArgCount(SourceRange nameRange, size_t expectedArgCount, llvm::ArrayRef args, @@ -224,6 +277,14 @@ makeMatcherAutoMarshall(ReturnType (*matcherFunc)(ArgTypes...), reinterpret_cast(matcherFunc), matcherName, argKinds); } +// Variadic operator overload. +template +std::unique_ptr +makeMatcherAutoMarshall(VariadicOperatorMatcherFunc func, + StringRef matcherName) { + return std::make_unique( + MinCount, MaxCount, func.varOp, matcherName); +} } // namespace mlir::query::matcher::internal #endif // MLIR_TOOLS_MLIRQUERY_MATCHER_MARSHALLERS_H diff --git a/mlir/include/mlir/Query/Matcher/MatchFinder.h b/mlir/include/mlir/Query/Matcher/MatchFinder.h index f8abf20ef60bb..6d06ca13d1344 100644 --- a/mlir/include/mlir/Query/Matcher/MatchFinder.h +++ b/mlir/include/mlir/Query/Matcher/MatchFinder.h @@ -21,7 +21,9 @@ namespace mlir::query::matcher { -/// A class that provides utilities to find operations in the IR. +/// Finds and collects matches from the IR. After construction +/// `collectMatches` can be used to traverse the IR and apply +/// matchers. class MatchFinder { public: diff --git a/mlir/include/mlir/Query/Matcher/MatchersInternal.h b/mlir/include/mlir/Query/Matcher/MatchersInternal.h index 183b2514e109f..88109430b6feb 100644 --- a/mlir/include/mlir/Query/Matcher/MatchersInternal.h +++ b/mlir/include/mlir/Query/Matcher/MatchersInternal.h @@ -8,11 +8,11 @@ // // Implements the base layer of the matcher framework. 
// -// Matchers are methods that return a Matcher which provides a method one of the -// following methods: match(Operation *op), match(Operation *op, -// SetVector &matchedOps) +// Matchers are methods that return a Matcher which provides a +// `match(...)` method whose parameters define the context of the match. +// Support includes simple (unary) matchers as well as matcher combinators +// (anyOf, allOf, etc.) // -// The matcher functions are defined in include/mlir/IR/Matchers.h. // This file contains the wrapper classes needed to construct matchers for // mlir-query. // @@ -25,6 +25,15 @@ #include "llvm/ADT/IntrusiveRefCntPtr.h" namespace mlir::query::matcher { +class DynMatcher; +namespace internal { + +bool allOfVariadicOperator(Operation *op, SetVector *matchedOps, + ArrayRef innerMatchers); +bool anyOfVariadicOperator(Operation *op, SetVector *matchedOps, + ArrayRef innerMatchers); + +} // namespace internal // Defaults to false if T has no match() method with the signature: // match(Operation* op). @@ -84,6 +93,27 @@ class MatcherFnImpl : public MatcherInterface { MatcherFn matcherFn; }; +// VariadicMatcher takes a vector of Matchers and returns true if any Matchers +// match the given operation. +using VariadicOperatorFunction = bool (*)(Operation *op, + SetVector *matchedOps, + ArrayRef innerMatchers); + +template +class VariadicMatcher : public MatcherInterface { +public: + VariadicMatcher(std::vector matchers) + : matchers(std::move(matchers)) {} + + bool match(Operation *op) override { return Func(op, nullptr, matchers); } + bool match(Operation *op, SetVector &matchedOps) override { + return Func(op, &matchedOps, matchers); + } + +private: + std::vector matchers; +}; + // Matcher wraps a MatcherInterface implementation and provides match() // methods that redirect calls to the underlying implementation. 
class DynMatcher { @@ -92,6 +122,31 @@ class DynMatcher { DynMatcher(MatcherInterface *implementation) : implementation(implementation) {} + // Construct from a variadic function. + enum VariadicOperator { + // Matches operations for which all provided matchers match. + AllOf, + // Matches operations for which at least one of the provided matchers + // matches. + AnyOf + }; + + static std::unique_ptr + constructVariadic(VariadicOperator Op, + std::vector innerMatchers) { + switch (Op) { + case AllOf: + return std::make_unique( + new VariadicMatcher( + std::move(innerMatchers))); + case AnyOf: + return std::make_unique( + new VariadicMatcher( + std::move(innerMatchers))); + } + llvm_unreachable("Invalid Op value."); + } + template static std::unique_ptr constructDynMatcherFromMatcherFn(MatcherFn &matcherFn) { @@ -113,6 +168,59 @@ class DynMatcher { std::string functionName; }; +// VariadicOperatorMatcher related types. +template +class VariadicOperatorMatcher { +public: + VariadicOperatorMatcher(DynMatcher::VariadicOperator varOp, Ps &&...params) + : varOp(varOp), params(std::forward(params)...) {} + + operator std::unique_ptr() const & { + return DynMatcher::constructVariadic( + varOp, getMatchers(std::index_sequence_for())); + } + + operator std::unique_ptr() && { + return DynMatcher::constructVariadic( + varOp, std::move(*this).getMatchers(std::index_sequence_for())); + } + +private: + // Helper method to unpack the tuple into a vector. + template + std::vector getMatchers(std::index_sequence) const & { + return {DynMatcher(std::get(params))...}; + } + + template + std::vector getMatchers(std::index_sequence) && { + return {DynMatcher(std::get(std::move(params)))...}; + } + + const DynMatcher::VariadicOperator varOp; + std::tuple params; +}; + +// Overloaded function object to generate VariadicOperatorMatcher objects from +// arbitrary matchers. 
+template +struct VariadicOperatorMatcherFunc { + DynMatcher::VariadicOperator varOp; + + template + VariadicOperatorMatcher operator()(Ms &&...Ps) const { + static_assert(MinCount <= sizeof...(Ms) && sizeof...(Ms) <= MaxCount, + "invalid number of parameters for variadic matcher"); + return VariadicOperatorMatcher(varOp, std::forward(Ps)...); + } +}; + +namespace internal { +const VariadicOperatorMatcherFunc<1, std::numeric_limits::max()> + anyOf = {DynMatcher::AnyOf}; +const VariadicOperatorMatcherFunc<1, std::numeric_limits::max()> + allOf = {DynMatcher::AllOf}; +} // namespace internal } // namespace mlir::query::matcher #endif // MLIR_TOOLS_MLIRQUERY_MATCHER_MATCHERSINTERNAL_H diff --git a/mlir/include/mlir/Query/Matcher/SliceMatchers.h b/mlir/include/mlir/Query/Matcher/SliceMatchers.h index 441205b3a9615..7181648f06f89 100644 --- a/mlir/include/mlir/Query/Matcher/SliceMatchers.h +++ b/mlir/include/mlir/Query/Matcher/SliceMatchers.h @@ -6,7 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file provides matchers for MLIRQuery that peform slicing analysis +// This file defines slicing-analysis matchers that extend and abstract the +// core implementations from `SliceAnalysis.h`. // //===----------------------------------------------------------------------===// @@ -16,9 +17,9 @@ #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/IR/Operation.h" -/// A matcher encapsulating `getBackwardSlice` method from SliceAnalysis.h. -/// Additionally, it limits the slice computation to a certain depth level using -/// a custom filter. +/// Computes the backward-slice of all transitive defs reachable from `rootOp`, +/// if `innerMatcher` matches. The traversal stops once the desired depth level +/// is reached. 
/// /// Example: starting from node 9, assuming the matcher /// computes the slice for the first two depth levels: @@ -119,6 +120,77 @@ bool BackwardSliceMatcher::matches( : backwardSlice.size() >= 1; } +/// Computes the backward-slice of all transitive defs reachable from `rootOp`, +/// if `innerMatcher` matches. Traversal stops where `filterMatcher` matches. +template +class PredicateBackwardSliceMatcher { +public: + PredicateBackwardSliceMatcher(BaseMatcher innerMatcher, Filter filterMatcher, + bool inclusive, bool omitBlockArguments, + bool omitUsesFromAbove) + : innerMatcher(std::move(innerMatcher)), + filterMatcher(std::move(filterMatcher)), inclusive(inclusive), + omitBlockArguments(omitBlockArguments), + omitUsesFromAbove(omitUsesFromAbove) {} + + bool match(Operation *rootOp, SetVector &backwardSlice) { + backwardSlice.clear(); + BackwardSliceOptions options; + options.inclusive = inclusive; + options.omitUsesFromAbove = omitUsesFromAbove; + options.omitBlockArguments = omitBlockArguments; + if (innerMatcher.match(rootOp)) { + options.filter = [&](Operation *subOp) { + return !filterMatcher.match(subOp); + }; + LogicalResult result = getBackwardSlice(rootOp, &backwardSlice, options); + assert(result.succeeded() && "expected backward slice to succeed"); + (void)result; + return options.inclusive ? backwardSlice.size() > 1 + : backwardSlice.size() >= 1; + } + return false; + } + +private: + BaseMatcher innerMatcher; + Filter filterMatcher; + bool inclusive; + bool omitBlockArguments; + bool omitUsesFromAbove; +}; + +/// Computes the forward-slice of all users reachable from `rootOp`, +/// if `innerMatcher` matches. Traversal stops where `filterMatcher` matches. 
+template +class PredicateForwardSliceMatcher { +public: + PredicateForwardSliceMatcher(BaseMatcher innerMatcher, Filter filterMatcher, + bool inclusive) + : innerMatcher(std::move(innerMatcher)), + filterMatcher(std::move(filterMatcher)), inclusive(inclusive) {} + + bool match(Operation *rootOp, SetVector &forwardSlice) { + forwardSlice.clear(); + ForwardSliceOptions options; + options.inclusive = inclusive; + if (innerMatcher.match(rootOp)) { + options.filter = [&](Operation *subOp) { + return !filterMatcher.match(subOp); + }; + getForwardSlice(rootOp, &forwardSlice, options); + return options.inclusive ? forwardSlice.size() > 1 + : forwardSlice.size() >= 1; + } + return false; + } + +private: + BaseMatcher innerMatcher; + Filter filterMatcher; + bool inclusive; +}; + /// Matches transitive defs of a top-level operation up to N levels. template inline BackwardSliceMatcher @@ -130,7 +202,7 @@ m_GetDefinitions(Matcher innerMatcher, int64_t maxDepth, bool inclusive, omitUsesFromAbove); } -/// Matches all transitive defs of a top-level operation up to N levels +/// Matches all transitive defs of a top-level operation up to N levels. template inline BackwardSliceMatcher m_GetAllDefinitions(Matcher innerMatcher, int64_t maxDepth) { @@ -139,6 +211,28 @@ inline BackwardSliceMatcher m_GetAllDefinitions(Matcher innerMatcher, false, false); } +/// Matches all transitive defs of a top-level operation and stops where +/// `filterMatcher` rejects. +template +inline PredicateBackwardSliceMatcher +m_GetDefinitionsByPredicate(BaseMatcher innerMatcher, Filter filterMatcher, + bool inclusive, bool omitBlockArguments, + bool omitUsesFromAbove) { + return PredicateBackwardSliceMatcher( + std::move(innerMatcher), std::move(filterMatcher), inclusive, + omitBlockArguments, omitUsesFromAbove); +} + +/// Matches all users of a top-level operation and stops where +/// `filterMatcher` rejects. 
+template +inline PredicateForwardSliceMatcher +m_GetUsersByPredicate(BaseMatcher innerMatcher, Filter filterMatcher, + bool inclusive) { + return PredicateForwardSliceMatcher( + std::move(innerMatcher), std::move(filterMatcher), inclusive); +} + } // namespace mlir::query::matcher #endif // MLIR_TOOLS_MLIRQUERY_MATCHERS_SLICEMATCHERS_H diff --git a/mlir/include/mlir/Query/Matcher/VariantValue.h b/mlir/include/mlir/Query/Matcher/VariantValue.h index 98c0a18e25101..1a47576de1841 100644 --- a/mlir/include/mlir/Query/Matcher/VariantValue.h +++ b/mlir/include/mlir/Query/Matcher/VariantValue.h @@ -26,7 +26,12 @@ enum class ArgKind { Boolean, Matcher, Signed, String }; // A variant matcher object to abstract simple and complex matchers into a // single object type. class VariantMatcher { - class MatcherOps; + class MatcherOps { + public: + std::optional + constructVariadicOperator(DynMatcher::VariadicOperator varOp, + ArrayRef innerMatchers) const; + }; // Payload interface to be specialized by each matcher type. It follows a // similar interface as VariantMatcher itself. @@ -43,6 +48,9 @@ class VariantMatcher { // Clones the provided matcher. static VariantMatcher SingleMatcher(DynMatcher matcher); + static VariantMatcher + VariadicOperatorMatcher(DynMatcher::VariadicOperator varOp, + ArrayRef args); // Makes the matcher the "null" matcher. void reset(); @@ -61,6 +69,7 @@ class VariantMatcher { : value(std::move(value)) {} class SinglePayload; + class VariadicOpPayload; std::shared_ptr value; }; diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h index 97ae14aa0d6af..0f136c5c46d79 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h @@ -189,7 +189,7 @@ class ModuleTranslation { llvm::Instruction *inst); /// Sets LLVM profiling metadata for operations that have branch weights. 
- void setBranchWeightsMetadata(BranchWeightOpInterface op); + void setBranchWeightsMetadata(WeightedBranchOpInterface op); /// Sets LLVM loop metadata for branch operations that have a loop annotation /// attribute. diff --git a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp index debfd003bd5b5..d31d7d801e149 100644 --- a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp +++ b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp @@ -166,10 +166,15 @@ struct CondBranchOpLowering : public ConvertOpToLLVMPattern { TypeRange(adaptor.getFalseDestOperands())); if (failed(convertedFalseBlock)) return failure(); - Operation *newOp = rewriter.replaceOpWithNewOp( + auto newOp = rewriter.replaceOpWithNewOp( op, adaptor.getCondition(), *convertedTrueBlock, adaptor.getTrueDestOperands(), *convertedFalseBlock, adaptor.getFalseDestOperands()); + ArrayRef weights = op.getWeights(); + if (!weights.empty()) { + newOp.setWeights(weights); + op.removeBranchWeightsAttr(); + } // TODO: We should not just forward all attributes like that. But there are // existing Flang tests that depend on this behavior. newOp->setAttrs(op->getAttrDictionary()); diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp index ade4e4d3de8ec..8ccf1bfc292d5 100644 --- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp +++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp @@ -28,6 +28,9 @@ #include "llvm/Support/MathExtras.h" #include +#define DEBUG_TYPE "memref-to-llvm" +#define DBGS() llvm::dbgs() << "[" DEBUG_TYPE "] " + namespace mlir { #define GEN_PASS_DEF_FINALIZEMEMREFTOLLVMCONVERSIONPASS #include "mlir/Conversion/Passes.h.inc" @@ -1782,12 +1785,22 @@ matchSimpleAtomicOp(memref::AtomicRMWOp atomicOp) { case arith::AtomicRMWKind::assign: return LLVM::AtomicBinOp::xchg; case arith::AtomicRMWKind::maximumf: + // TODO: remove this by end of 2025. 
+ LLVM_DEBUG(DBGS() << "the lowering of memref.atomicrmw maximumf changed " + "from fmax to fmaximum, expect more NaNs"); + return LLVM::AtomicBinOp::fmaximum; + case arith::AtomicRMWKind::maxnumf: return LLVM::AtomicBinOp::fmax; case arith::AtomicRMWKind::maxs: return LLVM::AtomicBinOp::max; case arith::AtomicRMWKind::maxu: return LLVM::AtomicBinOp::umax; case arith::AtomicRMWKind::minimumf: + // TODO: remove this by end of 2025. + LLVM_DEBUG(DBGS() << "the lowering of memref.atomicrmw minimum changed " + "from fmin to fminimum, expect more NaNs"); + return LLVM::AtomicBinOp::fminimum; + case arith::AtomicRMWKind::minnumf: return LLVM::AtomicBinOp::fmin; case arith::AtomicRMWKind::mins: return LLVM::AtomicBinOp::min; diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp index 35205a6ca2eee..c0ef28c648ac5 100644 --- a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp @@ -84,126 +84,130 @@ static SmallVector computeStrides(Location loc, RewriterBase &rewriter, return result; } +LogicalResult +affine::lowerAffineDelinearizeIndexOp(RewriterBase &rewriter, + AffineDelinearizeIndexOp op) { + Location loc = op.getLoc(); + Value linearIdx = op.getLinearIndex(); + unsigned numResults = op.getNumResults(); + ArrayRef staticBasis = op.getStaticBasis(); + if (numResults == staticBasis.size()) + staticBasis = staticBasis.drop_front(); + + if (numResults == 1) { + rewriter.replaceOp(op, linearIdx); + return success(); + } + + SmallVector results; + results.reserve(numResults); + SmallVector strides = + computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis, + /*knownNonNegative=*/true); + + Value zero = rewriter.createOrFold(loc, 0); + + Value initialPart = + rewriter.create(loc, linearIdx, strides.front()); + results.push_back(initialPart); + + auto emitModTerm = [&](Value stride) -> Value { + Value remainder = 
rewriter.create(loc, linearIdx, stride); + Value remainderNegative = rewriter.create( + loc, arith::CmpIPredicate::slt, remainder, zero); + // If the correction is relevant, this term is <= stride, which is known + // to be positive in `index`. Otherwise, while 2 * stride might overflow, + // this branch won't be taken, so the risk of `poison` is fine. + Value corrected = rewriter.create( + loc, remainder, stride, arith::IntegerOverflowFlags::nsw); + Value mod = rewriter.create(loc, remainderNegative, + corrected, remainder); + return mod; + }; + + // Generate all the intermediate parts + for (size_t i = 0, e = strides.size() - 1; i < e; ++i) { + Value thisStride = strides[i]; + Value nextStride = strides[i + 1]; + Value modulus = emitModTerm(thisStride); + // We know both inputs are positive, so floorDiv == div. + // This could potentially be a divui, but it's not clear if that would + // cause issues. + Value divided = rewriter.create(loc, modulus, nextStride); + results.push_back(divided); + } + + results.push_back(emitModTerm(strides.back())); + + rewriter.replaceOp(op, results); + return success(); +} + +LogicalResult affine::lowerAffineLinearizeIndexOp(RewriterBase &rewriter, + AffineLinearizeIndexOp op) { + // Should be folded away, included here for safety. + if (op.getMultiIndex().empty()) { + rewriter.replaceOpWithNewOp(op, 0); + return success(); + } + + Location loc = op.getLoc(); + ValueRange multiIndex = op.getMultiIndex(); + size_t numIndexes = multiIndex.size(); + ArrayRef staticBasis = op.getStaticBasis(); + if (numIndexes == staticBasis.size()) + staticBasis = staticBasis.drop_front(); + + SmallVector strides = + computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis, + /*knownNonNegative=*/op.getDisjoint()); + SmallVector> scaledValues; + scaledValues.reserve(numIndexes); + + // Note: strides doesn't contain a value for the final element (stride 1) + // and everything else lines up. 
We use the "mutable" accessor so we can get + // our hands on an `OpOperand&` for the loop invariant counting function. + for (auto [stride, idxOp] : + llvm::zip_equal(strides, llvm::drop_end(op.getMultiIndexMutable()))) { + Value scaledIdx = rewriter.create( + loc, idxOp.get(), stride, arith::IntegerOverflowFlags::nsw); + int64_t numHoistableLoops = numEnclosingInvariantLoops(idxOp); + scaledValues.emplace_back(scaledIdx, numHoistableLoops); + } + scaledValues.emplace_back( + multiIndex.back(), + numEnclosingInvariantLoops(op.getMultiIndexMutable()[numIndexes - 1])); + + // Sort by how many enclosing loops there are, ties implicitly broken by + // size of the stride. + llvm::stable_sort(scaledValues, + [&](auto l, auto r) { return l.second > r.second; }); + + Value result = scaledValues.front().first; + for (auto [scaledValue, numHoistableLoops] : llvm::drop_begin(scaledValues)) { + std::ignore = numHoistableLoops; + result = rewriter.create(loc, result, scaledValue, + arith::IntegerOverflowFlags::nsw); + } + rewriter.replaceOp(op, result); + return success(); +} + namespace { -/// Lowers `affine.delinearize_index` into a sequence of division and remainder -/// operations. 
struct LowerDelinearizeIndexOps : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(AffineDelinearizeIndexOp op, PatternRewriter &rewriter) const override { - Location loc = op.getLoc(); - Value linearIdx = op.getLinearIndex(); - unsigned numResults = op.getNumResults(); - ArrayRef staticBasis = op.getStaticBasis(); - if (numResults == staticBasis.size()) - staticBasis = staticBasis.drop_front(); - - if (numResults == 1) { - rewriter.replaceOp(op, linearIdx); - return success(); - } - - SmallVector results; - results.reserve(numResults); - SmallVector strides = - computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis, - /*knownNonNegative=*/true); - - Value zero = rewriter.createOrFold(loc, 0); - - Value initialPart = - rewriter.create(loc, linearIdx, strides.front()); - results.push_back(initialPart); - - auto emitModTerm = [&](Value stride) -> Value { - Value remainder = rewriter.create(loc, linearIdx, stride); - Value remainderNegative = rewriter.create( - loc, arith::CmpIPredicate::slt, remainder, zero); - // If the correction is relevant, this term is <= stride, which is known - // to be positive in `index`. Otherwise, while 2 * stride might overflow, - // this branch won't be taken, so the risk of `poison` is fine. - Value corrected = rewriter.create( - loc, remainder, stride, arith::IntegerOverflowFlags::nsw); - Value mod = rewriter.create(loc, remainderNegative, - corrected, remainder); - return mod; - }; - - // Generate all the intermediate parts - for (size_t i = 0, e = strides.size() - 1; i < e; ++i) { - Value thisStride = strides[i]; - Value nextStride = strides[i + 1]; - Value modulus = emitModTerm(thisStride); - // We know both inputs are positive, so floorDiv == div. - // This could potentially be a divui, but it's not clear if that would - // cause issues. 
- Value divided = rewriter.create(loc, modulus, nextStride); - results.push_back(divided); - } - - results.push_back(emitModTerm(strides.back())); - - rewriter.replaceOp(op, results); - return success(); + return affine::lowerAffineDelinearizeIndexOp(rewriter, op); } }; -/// Lowers `affine.linearize_index` into a sequence of multiplications and -/// additions. Make a best effort to sort the input indices so that -/// the most loop-invariant terms are at the left of the additions -/// to enable loop-invariant code motion. struct LowerLinearizeIndexOps final : OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(AffineLinearizeIndexOp op, PatternRewriter &rewriter) const override { - // Should be folded away, included here for safety. - if (op.getMultiIndex().empty()) { - rewriter.replaceOpWithNewOp(op, 0); - return success(); - } - - Location loc = op.getLoc(); - ValueRange multiIndex = op.getMultiIndex(); - size_t numIndexes = multiIndex.size(); - ArrayRef staticBasis = op.getStaticBasis(); - if (numIndexes == staticBasis.size()) - staticBasis = staticBasis.drop_front(); - - SmallVector strides = - computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis, - /*knownNonNegative=*/op.getDisjoint()); - SmallVector> scaledValues; - scaledValues.reserve(numIndexes); - - // Note: strides doesn't contain a value for the final element (stride 1) - // and everything else lines up. We use the "mutable" accessor so we can get - // our hands on an `OpOperand&` for the loop invariant counting function. 
- for (auto [stride, idxOp] : - llvm::zip_equal(strides, llvm::drop_end(op.getMultiIndexMutable()))) { - Value scaledIdx = rewriter.create( - loc, idxOp.get(), stride, arith::IntegerOverflowFlags::nsw); - int64_t numHoistableLoops = numEnclosingInvariantLoops(idxOp); - scaledValues.emplace_back(scaledIdx, numHoistableLoops); - } - scaledValues.emplace_back( - multiIndex.back(), - numEnclosingInvariantLoops(op.getMultiIndexMutable()[numIndexes - 1])); - - // Sort by how many enclosing loops there are, ties implicitly broken by - // size of the stride. - llvm::stable_sort(scaledValues, - [&](auto l, auto r) { return l.second > r.second; }); - - Value result = scaledValues.front().first; - for (auto [scaledValue, numHoistableLoops] : - llvm::drop_begin(scaledValues)) { - std::ignore = numHoistableLoops; - result = rewriter.create(loc, result, scaledValue, - arith::IntegerOverflowFlags::nsw); - } - rewriter.replaceOp(op, result); - return success(); + return affine::lowerAffineLinearizeIndexOp(rewriter, op); } }; diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index 1709654b90138..e602210c2dc6c 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/EmitC/IR/EmitC.h" -#include "mlir/Dialect/EmitC/IR/EmitCTraits.h" +#include "mlir/Dialect/EmitC/IR/EmitCInterfaces.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" @@ -386,9 +386,7 @@ OpFoldResult emitc::ConstantOp::fold(FoldAdaptor adaptor) { return getValue(); } Operation *ExpressionOp::getRootOp() { auto yieldOp = cast(getBody()->getTerminator()); Value yieldedValue = yieldOp.getResult(); - Operation *rootOp = yieldedValue.getDefiningOp(); - assert(rootOp && "Yielded value not defined within expression"); - return rootOp; + return yieldedValue.getDefiningOp(); } 
LogicalResult ExpressionOp::verify() { @@ -406,13 +404,21 @@ LogicalResult ExpressionOp::verify() { if (!yieldResult) return emitOpError("must yield a value at termination"); + Operation *rootOp = yieldResult.getDefiningOp(); + + if (!rootOp) + return emitOpError("yielded value has no defining op"); + + if (rootOp->getParentOp() != getOperation()) + return emitOpError("yielded value not defined within expression"); + Type yieldType = yieldResult.getType(); if (resultType != yieldType) return emitOpError("requires yielded type to match return type"); for (Operation &op : region.front().without_terminator()) { - if (!op.hasTrait()) + if (!isa(op)) return emitOpError("contains an unsupported operation"); if (op.getNumResults() != 1) return emitOpError("requires exactly one result for each operation"); @@ -1398,5 +1404,7 @@ void FileOp::build(OpBuilder &builder, OperationState &state, StringRef id) { // TableGen'd op method definitions //===----------------------------------------------------------------------===// +#include "mlir/Dialect/EmitC/IR/EmitCInterfaces.cpp.inc" + #define GET_OP_CLASSES #include "mlir/Dialect/EmitC/IR/EmitC.cpp.inc" diff --git a/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp b/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp index 224d68ab8b4a6..2f3e2618f4d74 100644 --- a/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp +++ b/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp @@ -36,7 +36,7 @@ struct FormExpressionsPass // Wrap each C operator op with an expression op. 
OpBuilder builder(context); auto matchFun = [&](Operation *op) { - if (op->hasTrait() && + if (isa(*op) && !op->getParentOfType() && op->getNumResults() == 1) createExpression(op, builder); diff --git a/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp b/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp index 87350ecdceaaa..a578a86b499a6 100644 --- a/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp @@ -16,8 +16,7 @@ namespace mlir { namespace emitc { ExpressionOp createExpression(Operation *op, OpBuilder &builder) { - assert(op->hasTrait() && - "Expected a C expression"); + assert(isa(op) && "Expected a C expression"); // Create an expression yielding the value returned by op. assert(op->getNumResults() == 1 && "Expected exactly one result"); diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index c7528c970a4ba..a12aef0dfad38 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -589,10 +589,6 @@ LogicalResult SwitchOp::verify() { static_cast(getCaseDestinations().size()))) return emitOpError("expects number of case values to match number of " "case destinations"); - if (getBranchWeights() && getBranchWeights()->size() != getNumSuccessors()) - return emitError("expects number of branch weights to match number of " - "successors: ") - << getBranchWeights()->size() << " vs " << getNumSuccessors(); if (getCaseValues() && getValue().getType() != getCaseValues()->getElementType()) return emitError("expects case value type to match condition value type"); @@ -962,7 +958,6 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results, assert(callee && "expected non-null callee in direct call builder"); build(builder, state, results, /*var_callee_type=*/nullptr, callee, args, /*fastmathFlags=*/nullptr, - /*branch_weights=*/nullptr, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, 
/*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, @@ -992,7 +987,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, build(builder, state, getCallOpResultTypes(calleeType), getCallOpVarCalleeType(calleeType), callee, args, /*fastmathFlags=*/nullptr, - /*branch_weights=*/nullptr, /*CConv=*/nullptr, + /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, @@ -1009,7 +1004,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, build(builder, state, getCallOpResultTypes(calleeType), getCallOpVarCalleeType(calleeType), /*callee=*/nullptr, args, - /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, + /*fastmathFlags=*/nullptr, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, @@ -1025,7 +1020,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, auto calleeType = func.getFunctionType(); build(builder, state, getCallOpResultTypes(calleeType), getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), args, - /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, + /*fastmathFlags=*/nullptr, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td index e8d2274d29aa0..39fbab8f37a2e 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td @@ -75,3 +75,11 @@ def ConvertComparisonIntoClamp2_#CmpClampPair[0] : Pat< )), (CmpClampPair[1] $input, $min, $max)>; } + 
+//===----------------------------------------------------------------------===// +// spirv.GL.Length -> spirv.GL.FAbs +//===----------------------------------------------------------------------===// + +def ConvertGLLengthToGLFAbs : Pat< + (SPIRV_GLLengthOp SPIRV_Float:$operand), + (SPIRV_GLFAbsOp $operand)>; diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp index 3ad8057a58dc9..46acb8c156fc6 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp @@ -34,8 +34,8 @@ void populateSPIRVGLCanonicalizationPatterns(RewritePatternSet &results) { ConvertComparisonIntoClamp2_SPIRV_SLessThanOp, ConvertComparisonIntoClamp2_SPIRV_SLessThanEqualOp, ConvertComparisonIntoClamp2_SPIRV_ULessThanOp, - ConvertComparisonIntoClamp2_SPIRV_ULessThanEqualOp>( - results.getContext()); + ConvertComparisonIntoClamp2_SPIRV_ULessThanEqualOp, + ConvertGLLengthToGLFAbs>(results.getContext()); } } // namespace spirv } // namespace mlir diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp index 11ae0108594dd..289296a07d9d3 100644 --- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -24,8 +24,7 @@ using namespace mlir::tensor; PadOp mlir::tensor::createPadHighOp(RankedTensorType resType, Value source, Value pad, bool nofold, Location loc, - OpBuilder &b, - SmallVector dynOutDims) { + OpBuilder &b, ValueRange dynOutDims) { // This assumption simplifies the following logic without limiting what's // required _today_. If needed, we can relax it in the future. 
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index 582d082153bef..bb9bdd70625e4 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -788,46 +788,47 @@ transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter, // Obtain a single options-string to pass to the pass(-pipeline) from options // passed in as a dictionary of keys mapping to values which are either // attributes or param-operands pointing to attributes. + OperandRange dynamicOptions = getDynamicOptions(); std::string options; llvm::raw_string_ostream optionsStream(options); // For "printing" attrs. - OperandRange dynamicOptions = getDynamicOptions(); - for (auto [idx, namedAttribute] : llvm::enumerate(getOptions())) { - if (idx > 0) - optionsStream << " "; // Interleave options separator. - optionsStream << namedAttribute.getName().str(); // Append the key. - optionsStream << "="; // And the key-value separator. - - Attribute valueAttrToAppend; - if (auto paramOperandIndex = - dyn_cast(namedAttribute.getValue())) { - // The corresponding value attribute is passed in via a param. + // A helper to convert an option's attribute value into a corresponding + // string representation, with the ability to obtain the attr(s) from a param. + std::function appendValueAttr = [&](Attribute valueAttr) { + if (auto paramOperand = dyn_cast(valueAttr)) { + // The corresponding value attribute(s) is/are passed in via a param. // Obtain the param-operand via its specified index. 
- size_t dynamicOptionIdx = paramOperandIndex.getIndex().getInt(); + size_t dynamicOptionIdx = paramOperand.getIndex().getInt(); assert(dynamicOptionIdx < dynamicOptions.size() && - "number of dynamic option markers (UnitAttr) in options ArrayAttr " + "the number of ParamOperandAttrs in the options DictionaryAttr" "should be the same as the number of options passed as params"); - ArrayRef dynamicOption = + ArrayRef attrsAssociatedToParam = state.getParams(dynamicOptions[dynamicOptionIdx]); - if (dynamicOption.size() != 1) - return emitSilenceableError() - << "options passed as a param must have " - "a single value associated, param " - << dynamicOptionIdx << " associates " << dynamicOption.size(); - valueAttrToAppend = dynamicOption[0]; - } else { - // Value is a static attribute. - valueAttrToAppend = namedAttribute.getValue(); - } - - // Append string representation of value attribute. - if (auto strAttr = dyn_cast(valueAttrToAppend)) { + // Recursive so as to append all attrs associated to the param. + llvm::interleave(attrsAssociatedToParam, optionsStream, appendValueAttr, + ","); + } else if (auto arrayAttr = dyn_cast(valueAttr)) { + // Recursive so as to append all nested attrs of the array. + llvm::interleave(arrayAttr, optionsStream, appendValueAttr, ","); + } else if (auto strAttr = dyn_cast(valueAttr)) { + // Convert to unquoted string. optionsStream << strAttr.getValue().str(); } else { - valueAttrToAppend.print(optionsStream, /*elideType=*/true); + // For all other attributes, ask the attr to print itself (without type). + valueAttr.print(optionsStream, /*elideType=*/true); } - } + }; + + // Convert the options DictionaryAttr into a single string. + llvm::interleave( + getOptions(), optionsStream, + [&](auto namedAttribute) { + optionsStream << namedAttribute.getName().str(); // Append the key. + optionsStream << "="; // And the key-value separator. + appendValueAttr(namedAttribute.getValue()); // And the attr's str repr. 
+ }, + " "); optionsStream.flush(); // Get pass or pass pipeline from registry. @@ -878,23 +879,30 @@ static ParseResult parseApplyRegisteredPassOptions( SmallVectorImpl &dynamicOptions) { // Construct the options DictionaryAttr per a `{ key = value, ... }` syntax. SmallVector keyValuePairs; - size_t dynamicOptionsIdx = 0; - auto parseKeyValuePair = [&]() -> ParseResult { - // Parse items of the form `key = value` where `key` is a bare identifier or - // a string and `value` is either an attribute or an operand. - std::string key; - Attribute valueAttr; - if (parser.parseOptionalKeywordOrString(&key)) - return parser.emitError(parser.getCurrentLocation()) - << "expected key to either be an identifier or a string"; - if (key.empty()) - return failure(); + // Helper for allowing parsing of option values which can be of the form: + // - a normal attribute + // - an operand (which would be converted to an attr referring to the operand) + // - ArrayAttrs containing the foregoing (in correspondence with ListOptions) + std::function parseValue = + [&](Attribute &valueAttr) -> ParseResult { + // Allow for array syntax, e.g. `[0 : i64, %param, true, %other_param]`: + if (succeeded(parser.parseOptionalLSquare())) { + SmallVector attrs; - if (parser.parseEqual()) - return parser.emitError(parser.getCurrentLocation()) - << "expected '=' after key in key-value pair"; + // Recursively parse the array's elements, which might be operands. + if (parser.parseCommaSeparatedList( + AsmParser::Delimiter::None, + [&]() -> ParseResult { return parseValue(attrs.emplace_back()); }, + " in options dictionary") || + parser.parseRSquare()) + return failure(); // NB: Attempted parse should've output error message. + + valueAttr = ArrayAttr::get(parser.getContext(), attrs); + + return success(); + } // Parse the value, which can be either an attribute or an operand. 
OptionalParseResult parsedValueAttr = @@ -903,9 +911,7 @@ static ParseResult parseApplyRegisteredPassOptions( OpAsmParser::UnresolvedOperand operand; ParseResult parsedOperand = parser.parseOperand(operand); if (failed(parsedOperand)) - return parser.emitError(parser.getCurrentLocation()) - << "expected a valid attribute or operand as value associated " - << "to key '" << key << "'"; + return failure(); // NB: Attempted parse should've output error message. // To make use of the operand, we need to store it in the options dict. // As SSA-values cannot occur in attributes, what we do instead is store // an attribute in its place that contains the index of the param-operand, @@ -924,7 +930,30 @@ static ParseResult parseApplyRegisteredPassOptions( << "in the generic print format"; } + return success(); + }; + + // Helper for `key = value`-pair parsing where `key` is a bare identifier or a + // string and `value` looks like either an attribute or an operand-in-an-attr. + std::function parseKeyValuePair = [&]() -> ParseResult { + std::string key; + Attribute valueAttr; + + if (failed(parser.parseOptionalKeywordOrString(&key)) || key.empty()) + return parser.emitError(parser.getCurrentLocation()) + << "expected key to either be an identifier or a string"; + + if (failed(parser.parseEqual())) + return parser.emitError(parser.getCurrentLocation()) + << "expected '=' after key in key-value pair"; + + if (failed(parseValue(valueAttr))) + return parser.emitError(parser.getCurrentLocation()) + << "expected a valid attribute or operand as value associated " + << "to key '" << key << "'"; + keyValuePairs.push_back(NamedAttribute(key, valueAttr)); + return success(); }; @@ -951,16 +980,27 @@ static void printApplyRegisteredPassOptions(OpAsmPrinter &printer, if (options.empty()) return; - printer << "{"; - llvm::interleaveComma(options, printer, [&](NamedAttribute namedAttribute) { - printer << namedAttribute.getName() << " = "; - Attribute value = namedAttribute.getValue(); - if 
(auto indexAttr = dyn_cast(value)) { + std::function printOptionValue = [&](Attribute valueAttr) { + if (auto paramOperandAttr = + dyn_cast(valueAttr)) { // Resolve index of param-operand to its actual SSA-value and print that. - printer.printOperand(dynamicOptions[indexAttr.getIndex().getInt()]); + printer.printOperand( + dynamicOptions[paramOperandAttr.getIndex().getInt()]); + } else if (auto arrayAttr = dyn_cast(valueAttr)) { + // This case is so that ArrayAttr-contained operands are pretty-printed. + printer << "["; + llvm::interleaveComma(arrayAttr, printer, printOptionValue); + printer << "]"; } else { - printer.printAttribute(value); + printer.printAttribute(valueAttr); } + }; + + printer << "{"; + llvm::interleaveComma(options, printer, [&](NamedAttribute namedAttribute) { + printer << namedAttribute.getName(); + printer << " = "; + printOptionValue(namedAttribute.getValue()); }); printer << "}"; } @@ -970,9 +1010,11 @@ LogicalResult transform::ApplyRegisteredPassOp::verify() { // and references to dynamic options in the options dictionary. auto dynamicOptions = SmallVector(getDynamicOptions()); - for (NamedAttribute namedAttr : getOptions()) - if (auto paramOperand = - dyn_cast(namedAttr.getValue())) { + + // Helper for option values to mark seen operands as having been seen (once). + std::function checkOptionValue = + [&](Attribute valueAttr) -> LogicalResult { + if (auto paramOperand = dyn_cast(valueAttr)) { size_t dynamicOptionIdx = paramOperand.getIndex().getInt(); if (dynamicOptionIdx < 0 || dynamicOptionIdx >= dynamicOptions.size()) return emitOpError() @@ -983,8 +1025,20 @@ LogicalResult transform::ApplyRegisteredPassOp::verify() { return emitOpError() << "dynamic option index " << dynamicOptionIdx << " is already used in options"; dynamicOptions[dynamicOptionIdx] = nullptr; // Mark this option as used. + } else if (auto arrayAttr = dyn_cast(valueAttr)) { + // Recurse into ArrayAttrs as they may contain references to operands. 
+ for (auto eltAttr : arrayAttr) + if (failed(checkOptionValue(eltAttr))) + return failure(); } + return success(); + }; + + for (NamedAttribute namedAttr : getOptions()) + if (failed(checkOptionValue(namedAttr.getValue()))) + return failure(); + // All dynamicOptions-params seen in the dict will have been set to null. for (Value dynamicOption : dynamicOptions) if (dynamicOption) return emitOpError() << "a param operand does not have a corresponding " diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp index 7dbb7a334fe62..384717aeca665 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp @@ -886,17 +886,31 @@ class RewriteScalarExtractOfTransferRead SmallVector newIndices(xferOp.getIndices().begin(), xferOp.getIndices().end()); for (auto [i, pos] : llvm::enumerate(extractOp.getMixedPosition())) { - assert(isa(pos) && "Unexpected non-constant index"); - int64_t offset = cast(cast(pos)).getInt(); int64_t idx = newIndices.size() - extractOp.getNumIndices() + i; - OpFoldResult ofr = affine::makeComposedFoldedAffineApply( - rewriter, extractOp.getLoc(), - rewriter.getAffineSymbolExpr(0) + offset, {newIndices[idx]}); - if (auto value = dyn_cast(ofr)) { + + // Compute affine expression `newIndices[idx] + pos` where `pos` can be + // either a constant or a value. 
+ OpFoldResult composedIdx; + if (auto attr = dyn_cast(pos)) { + int64_t offset = cast(attr).getInt(); + composedIdx = affine::makeComposedFoldedAffineApply( + rewriter, extractOp.getLoc(), + rewriter.getAffineSymbolExpr(0) + offset, {newIndices[idx]}); + } else { + Value dynamicOffset = cast(pos); + AffineExpr sym0, sym1; + bindSymbols(rewriter.getContext(), sym0, sym1); + composedIdx = affine::makeComposedFoldedAffineApply( + rewriter, extractOp.getLoc(), sym0 + sym1, + {newIndices[idx], dynamicOffset}); + } + + // Update the corresponding index with the folded result. + if (auto value = dyn_cast(composedIdx)) { newIndices[idx] = value; } else { newIndices[idx] = rewriter.create( - extractOp.getLoc(), *getConstantIntValue(ofr)); + extractOp.getLoc(), *getConstantIntValue(composedIdx)); } } if (isa(xferOp.getBase().getType())) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index 885477fe4cbd5..0457f8128b908 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -396,11 +396,304 @@ struct UnrollDpasOp : public UnrollPattern { } }; +struct UnrollCreateDescOp : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::CreateDescOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + xegpu::TensorDescType tdescTy = op.getType(); + TypedValue<::mlir::VectorType> indiceVec = op.getOffsets(); + VectorType indiceVecTy = indiceVec.getType(); + + if (!tdescTy.isScattered()) + return failure(); + + std::optional> targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + SmallVector targetIndiceShape(*targetShape); + int64_t originalChunkSize = tdescTy.getChunkSize(); + // IndiceVec is 1 dim lower than tdescTy when chunkSize is larger than 1. 
+ if (originalChunkSize > 1) + targetIndiceShape.pop_back(); + + auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0]; + SmallVector convertedIndiceTypes = + getUnrolledTypes(indiceVecTy, targetIndiceShape); + SmallVector convertedIndiceVec = + pack(indiceVec, convertedIndiceTypes, targetIndiceShape, loc, rewriter); + + SmallVector newOps; + + // More indices is need when chunkSize > 1. Since a big load from one + // address could be break into multiple small loads. + if (originalChunkSize > 1) { + int64_t blockedChunkSize = targetShape->back(); + int64_t numNewChunks = originalChunkSize / blockedChunkSize; + + for (auto [indice, indiceType] : + llvm::zip(convertedIndiceVec, convertedIndiceTypes)) { + for (int64_t i = 0; i < numNewChunks; ++i) { + // Compute the offset + Value inc = rewriter.create( + loc, i * blockedChunkSize); + Value incVec = rewriter.create(loc, indiceType, inc); + Value offsetIndice = + rewriter.create(loc, indice, incVec); + + auto newOp = rewriter.create( + loc, newTdescTy, op.getSource(), offsetIndice); + + newOps.push_back(newOp); + } + } + } else { + for (auto indice : convertedIndiceVec) { + auto newOp = rewriter.create( + loc, newTdescTy, op.getSource(), indice); + newOps.push_back(newOp); + } + } + + Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter); + rewriter.replaceOp(op, castOp); + + return success(); + } +}; + +struct UnrollLoadGatherOp : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::LoadGatherOp op, + PatternRewriter &rewriter) const override { + + Location loc = op.getLoc(); + VectorType valueTy = llvm::dyn_cast(op.getValue().getType()); + xegpu::TensorDescType tdescTy = op.getTensorDescType(); + + if (!tdescTy.isScattered()) + return failure(); + + std::optional> targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + SmallVector targetMaskShape(*targetShape); + int64_t originalChunkSize = tdescTy.getChunkSize(); + + VectorType 
maskTy = llvm::dyn_cast(op.getMask().getType()); + + Type elemTy = tdescTy.getElementType(); + VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy); + + SmallVector convertedTdescTypes = + getUnrolledTypes(tdescTy, *targetShape); + SmallVector convertedTdescs = pack( + op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); + + SmallVector convertedMaskTypes; + SmallVector convertedMasks; + + if (originalChunkSize > 1) { + targetMaskShape.pop_back(); + convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape); + SmallVector convertedMasks1D = pack( + op.getMask(), convertedMaskTypes, targetMaskShape, loc, rewriter); + int64_t blockedChunkSize = targetShape->back(); + int64_t numNewChunks = originalChunkSize / blockedChunkSize; + + for (auto mask : convertedMasks1D) { + for (int64_t i = 0; i < numNewChunks; ++i) + convertedMasks.push_back(mask); + } + // This is to handle the transpose effect when chunkSize > 1. + std::swap((*targetShape)[0], (*targetShape)[1]); + newValueTy = valueTy.cloneWith(*targetShape, elemTy); + } else { + convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape); + convertedMasks = pack(op.getMask(), convertedMaskTypes, targetMaskShape, + loc, rewriter); + } + + SmallVector newOps; + for (auto [t, m] : llvm::zip(convertedTdescs, convertedMasks)) { + auto newOp = rewriter.create( + loc, newValueTy, t, m, op.getTransposeAttr(), op.getL1HintAttr(), + op.getL2HintAttr(), op.getL3HintAttr()); + newOps.push_back(newOp); + } + + Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); + rewriter.replaceOp(op, castOp); + return success(); + } +}; + +struct UnrollPrefetchOp : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::PrefetchOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + xegpu::TensorDescType tdescTy = op.getTensorDescType(); + + if (!tdescTy.isScattered()) + return failure(); + + std::optional> 
targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + SmallVector convertedTdescTypes = + getUnrolledTypes(tdescTy, *targetShape); + SmallVector convertedTdesc = pack( + op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); + + for (auto t : convertedTdesc) + rewriter.create(loc, TypeRange(), t, op->getAttrs()); + + rewriter.eraseOp(op); + return success(); + } +}; + +struct UnrollStoreScatterOp : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::StoreScatterOp op, + PatternRewriter &rewriter) const override { + + Location loc = op.getLoc(); + VectorType valueTy = llvm::dyn_cast(op.getValue().getType()); + xegpu::TensorDescType tdescTy = op.getTensorDescType(); + + if (!tdescTy.isScattered()) + return failure(); + + std::optional> targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + SmallVector targetIndiceShape(*targetShape); + int64_t originalChunkSize = tdescTy.getChunkSize(); + + VectorType maskTy = llvm::dyn_cast(op.getMask().getType()); + + SmallVector convertedTdescTypes = + getUnrolledTypes(tdescTy, *targetShape); + SmallVector convertedTdescs = pack( + op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); + + SmallVector convertedMaskTypes; + SmallVector convertedMasks; + + if (originalChunkSize > 1) { + int64_t blockedChunkSize = targetShape->back(); + int64_t numNewChunks = originalChunkSize / blockedChunkSize; + convertedMaskTypes = getUnrolledTypes(maskTy, (*targetShape)[0]); + SmallVector convertedMasks1D = pack( + op.getMask(), convertedMaskTypes, (*targetShape)[0], loc, rewriter); + + for (auto mask : convertedMasks1D) { + for (int64_t i = 0; i < numNewChunks; ++i) { + convertedMasks.push_back(mask); + } + } + // This is to handle the transpose effect when chunkSize > 1. 
+ std::swap((*targetShape)[0], (*targetShape)[1]); + + } else { + convertedMaskTypes = getUnrolledTypes(maskTy, *targetShape); + convertedMasks = + pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter); + } + + SmallVector convertedValTypes = + getUnrolledTypes(valueTy, *targetShape); + SmallVector convertedValues = + pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); + + for (size_t i = 0; i < convertedValues.size(); ++i) { + Value v = convertedValues[i]; + Value t = convertedTdescs[i]; + Value m = op.getMask() ? convertedMasks[i] : nullptr; + rewriter.create( + loc, v, t, m, op.getTransposeAttr(), op.getL1HintAttr(), + op.getL2HintAttr(), op.getL3HintAttr()); + } + + rewriter.eraseOp(op); + return success(); + } +}; + +struct UnrollUpdateOffsetOp : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::UpdateOffsetOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + xegpu::TensorDescType tdescTy = op.getTensorDescType(); + + if (tdescTy.getRank() > 2) + return failure(); + + if (!tdescTy.isScattered()) + return failure(); + + std::optional> targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + SmallVector convertedTdescTypes = + getUnrolledTypes(tdescTy, *targetShape); + SmallVector convertedTdesc = pack( + op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); + + TypedValue<::mlir::VectorType> offsetVec = op.getOffsets(); + VectorType offsetVecTy = offsetVec.getType(); + SmallVector convertedOffsetTypes; + SmallVector convertedOffsetVec; + SmallVector newOps; + int64_t originalChunkSize = tdescTy.getChunkSize(); + if (originalChunkSize > 1) { + SmallVector shape1D(targetShape->begin(), + targetShape->end() - 1); + convertedOffsetTypes = getUnrolledTypes(offsetVecTy, shape1D); + SmallVector convertedOffsetVec1D = + pack(offsetVec, convertedOffsetTypes, shape1D, loc, rewriter); + + int64_t blockedChunkSize = 
targetShape->back(); + int64_t numNewChunks = originalChunkSize / blockedChunkSize; + + for (auto offset : convertedOffsetVec1D) { + for (int64_t i = 0; i < numNewChunks; ++i) { + convertedOffsetVec.push_back(offset); + } + } + + } else { + convertedOffsetTypes = getUnrolledTypes(offsetVecTy, *targetShape); + convertedOffsetVec = + pack(offsetVec, convertedOffsetTypes, *targetShape, loc, rewriter); + } + + for (auto [t, o] : llvm::zip(convertedTdesc, convertedOffsetVec)) { + auto newOp = + rewriter.create(loc, t.getType(), t, o); + newOps.push_back(newOp); + } + Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); + rewriter.replaceOp(op, castOp); + return success(); + } +}; + } // namespace void mlir::xegpu::populateXeGPUUnrollPatterns( RewritePatternSet &patterns, const xegpu::UnrollOptions &options) { patterns.add( - patterns.getContext(), options); + UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp, + UnrollCreateDescOp, UnrollLoadGatherOp, UnrollStoreScatterOp, + UnrollPrefetchOp, UnrollUpdateOffsetOp>(patterns.getContext(), + options); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index a26c6b52f0ddc..e3563d10bc6f1 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -8,10 +8,12 @@ #include "mlir/Dialect/XeGPU/Transforms/Passes.h" #include "mlir/Dialect/Affine/Utils.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/Index/IR/IndexOps.h" +#include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/Utils/IndexingUtils.h" @@ -19,6 +21,7 @@ #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" #include 
"mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Transforms/DialectConversion.h" +#include namespace mlir { namespace xegpu { @@ -328,6 +331,65 @@ struct WgToSgPrefetchNdOp : public OpConversionPattern { } }; +// This pattern transforms elementwise ops to work at subgroup level. +struct WgToSgElementwiseOp : public ConversionPattern { + WgToSgElementwiseOp(MLIRContext *ctx) + : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + // Only match ops with elementwise trait and single result. + if (!OpTrait::hasElementwiseMappableTraits(op) || op->getNumResults() != 1) + return failure(); + + auto resultType = dyn_cast(op->getResult(0).getType()); + assert(resultType && "Expected result to be a VectorType"); + + ArrayRef wgShape = resultType.getShape(); + + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op->getResult(0)); + if (!layout || !layout.getSgLayout()) + return failure(); + + SmallVector sgShape = getSgShapeAndCount(wgShape, layout).first; + + size_t numVariants = operands.empty() ? 
0 : operands.front().size(); + + if (llvm::any_of(operands, [&](const ValueRange &operandVec) { + return operandVec.size() != numVariants; + })) + return failure(); + + SmallVector newResults; + VectorType newResultType = + VectorType::get(sgShape, resultType.getElementType()); + + for (size_t i = 0; i < numVariants; ++i) { + SmallVector opOperands; + for (auto &operandVec : operands) + opOperands.push_back(operandVec[i]); + + OperationState state(op->getLoc(), op->getName()); + state.addOperands(opOperands); + state.addTypes(newResultType); + // Copy all attributes, but update "layout_result_0" to drop + // sgLayout/sgData + for (auto attr : op->getAttrs()) { + if (auto layout = dyn_cast(attr.getValue())) + state.addAttribute(attr.getName(), layout.dropSgLayoutAndData()); + else + state.addAttribute(attr.getName(), attr.getValue()); + } + Operation *newOp = rewriter.create(state); + newResults.push_back(newOp->getResult(0)); + } + + rewriter.replaceOpWithMultiple(op, {newResults}); + return success(); + } +}; + // Handles UnrealizedConversionCastOp generated during // SCFStructuralTypeConversions (step 1). This op may appear as either a // target or source materialization for Vector values, e.g.: @@ -411,7 +473,8 @@ namespace xegpu { void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { patterns.add(patterns.getContext()); + UnrealizedConversionCastOpPattern, WgToSgElementwiseOp>( + patterns.getContext()); } } // namespace xegpu } // namespace mlir @@ -518,6 +581,30 @@ void XeGPUWgToSgDistributePass::runOnOperation() { return isLegal(layout); }); + target.addDynamicallyLegalDialect( + [=](Operation *op) -> std::optional { + // Only handle elementwise mappable ops + if (!OpTrait::hasElementwiseMappableTraits(op)) + return true; + + VectorType resultType = + dyn_cast(op->getResult(0).getType()); + if (!resultType) + return true; + + // Check if all operands are vectors of the same shape + // TODO: Support other types. 
+ for (Value operand : op->getOperands()) { + VectorType operandType = dyn_cast(operand.getType()); + if (!operandType || operandType.getShape() != resultType.getShape()) { + return true; + } + } + + xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op->getResult(0)); + return isLegal(layout); + }); + target.addDynamicallyLegalOp( [=](UnrealizedConversionCastOp op) { return llvm::is_contained(existingCastOps, op.getOperation()); diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp index 2ae334b517a31..3a63db35eec0f 100644 --- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp +++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp @@ -9,6 +9,7 @@ #include #include "mlir/IR/BuiltinTypes.h" +#include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "llvm/ADT/SmallPtrSet.h" @@ -80,6 +81,51 @@ detail::verifyBranchSuccessorOperands(Operation *op, unsigned succNo, return success(); } +//===----------------------------------------------------------------------===// +// WeightedBranchOpInterface +//===----------------------------------------------------------------------===// + +static LogicalResult verifyWeights(Operation *op, + llvm::ArrayRef weights, + std::size_t expectedWeightsNum, + llvm::StringRef weightAnchorName, + llvm::StringRef weightRefName) { + if (weights.empty()) + return success(); + + if (weights.size() != expectedWeightsNum) + return op->emitError() << "expects number of " << weightAnchorName + << " weights to match number of " << weightRefName + << ": " << weights.size() << " vs " + << expectedWeightsNum; + + for (auto [index, weight] : llvm::enumerate(weights)) + if (weight < 0) + return op->emitError() << "weight #" << index << " must be non-negative"; + + if (llvm::all_of(weights, [](int32_t value) { return value == 0; })) + return op->emitError() << "branch weights cannot all be zero"; + + return success(); +} + +LogicalResult 
detail::verifyBranchWeights(Operation *op) { + llvm::ArrayRef weights = + cast(op).getWeights(); + return verifyWeights(op, weights, op->getNumSuccessors(), "branch", + "successors"); +} + +//===----------------------------------------------------------------------===// +// WeightedRegionBranchOpInterface +//===----------------------------------------------------------------------===// + +LogicalResult detail::verifyRegionBranchWeights(Operation *op) { + llvm::ArrayRef weights = + cast(op).getWeights(); + return verifyWeights(op, weights, op->getNumRegions(), "region", "regions"); +} + //===----------------------------------------------------------------------===// // RegionBranchOpInterface //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Query/Matcher/CMakeLists.txt b/mlir/lib/Query/Matcher/CMakeLists.txt index 629479bf7adc1..ba202762fdfbb 100644 --- a/mlir/lib/Query/Matcher/CMakeLists.txt +++ b/mlir/lib/Query/Matcher/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_library(MLIRQueryMatcher MatchFinder.cpp + MatchersInternal.cpp Parser.cpp RegistryManager.cpp VariantValue.cpp diff --git a/mlir/lib/Query/Matcher/MatchersInternal.cpp b/mlir/lib/Query/Matcher/MatchersInternal.cpp new file mode 100644 index 0000000000000..01f412ade846b --- /dev/null +++ b/mlir/lib/Query/Matcher/MatchersInternal.cpp @@ -0,0 +1,33 @@ +//===--- MatchersInternal.cpp----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Query/Matcher/MatchersInternal.h" +#include "llvm/ADT/SetVector.h" + +namespace mlir::query::matcher { + +namespace internal { + +bool allOfVariadicOperator(Operation *op, SetVector *matchedOps, + ArrayRef innerMatchers) { + return llvm::all_of(innerMatchers, [&](const DynMatcher &matcher) { + if (matchedOps) + return matcher.match(op, *matchedOps); + return matcher.match(op); + }); +} +bool anyOfVariadicOperator(Operation *op, SetVector *matchedOps, + ArrayRef innerMatchers) { + return llvm::any_of(innerMatchers, [&](const DynMatcher &matcher) { + if (matchedOps) + return matcher.match(op, *matchedOps); + return matcher.match(op); + }); +} +} // namespace internal +} // namespace mlir::query::matcher diff --git a/mlir/lib/Query/Matcher/RegistryManager.cpp b/mlir/lib/Query/Matcher/RegistryManager.cpp index 4b511c5f009e7..08b610453b11a 100644 --- a/mlir/lib/Query/Matcher/RegistryManager.cpp +++ b/mlir/lib/Query/Matcher/RegistryManager.cpp @@ -64,7 +64,7 @@ std::vector RegistryManager::getAcceptedCompletionTypes( unsigned argNumber = ctxEntry.second; std::vector nextTypeSet; - if (argNumber < ctor->getNumArgs()) + if (ctor->isVariadic() || argNumber < ctor->getNumArgs()) ctor->getArgKinds(argNumber, nextTypeSet); typeSet.insert(nextTypeSet.begin(), nextTypeSet.end()); @@ -83,7 +83,7 @@ RegistryManager::getMatcherCompletions(llvm::ArrayRef acceptedTypes, const internal::MatcherDescriptor &matcher = *m.getValue(); llvm::StringRef name = m.getKey(); - unsigned numArgs = matcher.getNumArgs(); + unsigned numArgs = matcher.isVariadic() ? 
1 : matcher.getNumArgs(); std::vector> argKinds(numArgs); for (const ArgKind &kind : acceptedTypes) { @@ -115,6 +115,9 @@ RegistryManager::getMatcherCompletions(llvm::ArrayRef acceptedTypes, } } + if (matcher.isVariadic()) + os << ",..."; + os << ")"; typedText += "("; diff --git a/mlir/lib/Query/Matcher/VariantValue.cpp b/mlir/lib/Query/Matcher/VariantValue.cpp index 1cb2d48f9d56f..7bf4774dba830 100644 --- a/mlir/lib/Query/Matcher/VariantValue.cpp +++ b/mlir/lib/Query/Matcher/VariantValue.cpp @@ -27,12 +27,64 @@ class VariantMatcher::SinglePayload : public VariantMatcher::Payload { DynMatcher matcher; }; +class VariantMatcher::VariadicOpPayload : public VariantMatcher::Payload { +public: + VariadicOpPayload(DynMatcher::VariadicOperator varOp, + std::vector args) + : varOp(varOp), args(std::move(args)) {} + + std::optional getDynMatcher() const override { + std::vector dynMatchers; + for (auto variantMatcher : args) { + std::optional dynMatcher = variantMatcher.getDynMatcher(); + if (dynMatcher) + dynMatchers.push_back(dynMatcher.value()); + } + auto result = DynMatcher::constructVariadic(varOp, dynMatchers); + return *result; + } + + std::string getTypeAsString() const override { + std::string inner; + llvm::interleave( + args, [&](auto const &arg) { inner += arg.getTypeAsString(); }, + [&] { inner += " & "; }); + return inner; + } + +private: + const DynMatcher::VariadicOperator varOp; + const std::vector args; +}; + VariantMatcher::VariantMatcher() = default; VariantMatcher VariantMatcher::SingleMatcher(DynMatcher matcher) { return VariantMatcher(std::make_shared(std::move(matcher))); } +VariantMatcher +VariantMatcher::VariadicOperatorMatcher(DynMatcher::VariadicOperator varOp, + ArrayRef args) { + return VariantMatcher( + std::make_shared(varOp, std::move(args))); +} + +std::optional VariantMatcher::MatcherOps::constructVariadicOperator( + DynMatcher::VariadicOperator varOp, + ArrayRef innerMatchers) const { + std::vector dynMatchers; + for (const auto 
&innerMatcher : innerMatchers) { + if (!innerMatcher.value) + return std::nullopt; + std::optional inner = innerMatcher.value->getDynMatcher(); + if (!inner) + return std::nullopt; + dynMatchers.push_back(*inner); + } + return *DynMatcher::constructVariadic(varOp, dynMatchers); +} + std::optional VariantMatcher::getDynMatcher() const { return value ? value->getDynMatcher() : std::nullopt; } diff --git a/mlir/lib/Query/Query.cpp b/mlir/lib/Query/Query.cpp index 803284d6df86a..637e1f3cdef87 100644 --- a/mlir/lib/Query/Query.cpp +++ b/mlir/lib/Query/Query.cpp @@ -10,6 +10,7 @@ #include "QueryParser.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/IRMapping.h" +#include "mlir/IR/Verifier.h" #include "mlir/Query/Matcher/MatchFinder.h" #include "mlir/Query/QuerySession.h" #include "llvm/ADT/SetVector.h" @@ -68,6 +69,8 @@ static Operation *extractFunction(std::vector &ops, // Clone operations and build function body std::vector clonedOps; std::vector clonedVals; + // TODO: Handle extraction of operations with compute payloads defined via + // regions. for (Operation *slicedOp : slice) { Operation *clonedOp = clonedOps.emplace_back(builder.clone(*slicedOp, mapper)); @@ -129,6 +132,8 @@ LogicalResult MatchQuery::run(llvm::raw_ostream &os, QuerySession &qs) const { finder.flattenMatchedOps(matches); Operation *function = extractFunction(flattenedMatches, rootOp->getContext(), functionName); + if (failed(verify(function))) + return mlir::failure(); os << "\n" << *function << "\n\n"; function->erase(); return mlir::success(); diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp index 5abc112ab8c7a..067a0470b14e4 100644 --- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp +++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp @@ -329,9 +329,9 @@ static bool shouldBeInlined(ExpressionOp expressionOp) { if (hasDeferredEmission(user)) return false; - // Do not inline expressions used by ops with the CExpression trait. 
If this - // was intended, the user could have been merged into the expression op. - return !user->hasTrait(); + // Do not inline expressions used by ops with the CExpressionInterface. If + // this was intended, the user could have been merged into the expression op. + return !isa(*user); } static LogicalResult printConstantOp(CppEmitter &emitter, Operation *operation, diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp index 1b5ce868b5c77..e67aa892afe09 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp @@ -146,8 +146,15 @@ static LogicalResult setProfilingAttr(OpBuilder &builder, llvm::MDNode *node, branchWeights.push_back(branchWeight->getZExtValue()); } - if (auto iface = dyn_cast(op)) { - iface.setBranchWeights(builder.getDenseI32ArrayAttr(branchWeights)); + if (auto iface = dyn_cast(op)) { + // LLVM allows attaching a single weight to call instructions. + // This is used for carrying the execution count information + // in PGO modes. MLIR WeightedBranchOpInterface does not allow this, + // so we drop the metadata in this case. + // LLVM should probably use the VP form of MD_prof metadata + // for such cases. 
+ if (op->getNumSuccessors() != 0) + iface.setWeights(branchWeights); return success(); } return failure(); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 6bccc1d6f5d30..90ce06a0345c0 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2294,8 +2294,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, if (!privateVarOrErr) return handleError(privateVarOrErr, *taskOp.getOperation()); - llvm::IRBuilderBase::InsertPointGuard guard(builder); - builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator()); + setInsertPointForPossiblyEmptyBlock(builder); // TODO: this is a bit of a hack for Fortran character boxes. // Character boxes are passed by value into the init region and then the diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index e5ca147ea98f8..3eaa24eb5c95b 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1055,7 +1055,7 @@ LogicalResult ModuleTranslation::convertBlockImpl(Block &bb, return failure(); // Set the branch weight metadata on the translated instruction. 
- if (auto iface = dyn_cast(op)) + if (auto iface = dyn_cast(op)) setBranchWeightsMetadata(iface); } @@ -2026,14 +2026,15 @@ void ModuleTranslation::setDereferenceableMetadata( inst->setMetadata(kindId, derefSizeNode); } -void ModuleTranslation::setBranchWeightsMetadata(BranchWeightOpInterface op) { - DenseI32ArrayAttr weightsAttr = op.getBranchWeightsOrNull(); - if (!weightsAttr) +void ModuleTranslation::setBranchWeightsMetadata(WeightedBranchOpInterface op) { + SmallVector weights; + llvm::transform(op.getWeights(), std::back_inserter(weights), + [](int32_t value) { return static_cast(value); }); + if (weights.empty()) return; llvm::Instruction *inst = isa(op) ? lookupCall(op) : lookupBranch(op); assert(inst && "expected the operation to have a mapping to an instruction"); - SmallVector weights(weightsAttr.asArrayRef()); inst->setMetadata( llvm::LLVMContext::MD_prof, llvm::MDBuilder(getLLVMContext()).createBranchWeights(weights)); diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp index d258bfd852961..56c64f38fe29a 100644 --- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp +++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp @@ -446,6 +446,19 @@ LogicalResult Serializer::processType(Location loc, Type type, LogicalResult Serializer::processTypeImpl(Location loc, Type type, uint32_t &typeID, SetVector &serializationCtx) { + + // Map unsigned integer types to singless integer types. + // This is needed otherwise the generated spirv assembly will contain + // twice a type declaration (like OpTypeInt 32 0) which is no permitted and + // such module fails validation. Indeed at MLIR level the two types are + // different and lookup in the cache below misses. + // Note: This conversion needs to happen here before the type is looked up in + // the cache. 
+ if (type.isUnsignedInteger()) { + type = IntegerType::get(loc->getContext(), type.getIntOrFloatBitWidth(), + IntegerType::SignednessSemantics::Signless); + } + typeID = getTypeID(type); if (typeID) return success(); diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py index bfe96b1b3e5d4..b075919d1ef0f 100644 --- a/mlir/python/mlir/dialects/transform/__init__.py +++ b/mlir/python/mlir/dialects/transform/__init__.py @@ -219,6 +219,11 @@ def __init__( super().__init__(_get_op_results_or_values(operands), loc=loc, ip=ip) +OptionValueTypes = Union[ + Sequence["OptionValueTypes"], Attribute, Value, Operation, OpView, str, int, bool +] + + @_ods_cext.register_operation(_Dialect, replace=True) class ApplyRegisteredPassOp(ApplyRegisteredPassOp): def __init__( @@ -227,12 +232,7 @@ def __init__( target: Union[Operation, Value, OpView], pass_name: Union[str, StringAttr], *, - options: Optional[ - Dict[ - Union[str, StringAttr], - Union[Attribute, Value, Operation, OpView, str, int, bool], - ] - ] = None, + options: Optional[Dict[Union[str, StringAttr], OptionValueTypes]] = None, loc=None, ip=None, ): @@ -243,26 +243,32 @@ def __init__( context = (loc and loc.context) or Context.current cur_param_operand_idx = 0 - for key, value in options.items() if options is not None else {}: - if isinstance(key, StringAttr): - key = key.value + def option_value_to_attr(value): + nonlocal cur_param_operand_idx if isinstance(value, (Value, Operation, OpView)): dynamic_options.append(_get_op_result_or_value(value)) - options_dict[key] = ParamOperandAttr(cur_param_operand_idx, context) cur_param_operand_idx += 1 + return ParamOperandAttr(cur_param_operand_idx - 1, context) elif isinstance(value, Attribute): - options_dict[key] = value + return value # The following cases auto-convert Python values to attributes. 
elif isinstance(value, bool): - options_dict[key] = BoolAttr.get(value) + return BoolAttr.get(value) elif isinstance(value, int): default_int_type = IntegerType.get_signless(64, context) - options_dict[key] = IntegerAttr.get(default_int_type, value) + return IntegerAttr.get(default_int_type, value) elif isinstance(value, str): - options_dict[key] = StringAttr.get(value) + return StringAttr.get(value) + elif isinstance(value, Sequence): + return ArrayAttr.get([option_value_to_attr(elt) for elt in value]) else: raise TypeError(f"Unsupported option type: {type(value)}") + + for key, value in options.items() if options is not None else {}: + if isinstance(key, StringAttr): + key = key.value + options_dict[key] = option_value_to_attr(value) super().__init__( result, _get_op_result_or_value(target), @@ -279,12 +285,7 @@ def apply_registered_pass( target: Union[Operation, Value, OpView], pass_name: Union[str, StringAttr], *, - options: Optional[ - Dict[ - Union[str, StringAttr], - Union[Attribute, Value, Operation, OpView, str, int, bool], - ] - ] = None, + options: Optional[Dict[Union[str, StringAttr], OptionValueTypes]] = None, loc=None, ip=None, ) -> Value: diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index ac8b44f53aebf..89568e7766ae5 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -68,6 +68,7 @@ endif() llvm_canonicalize_cmake_booleans( LLVM_BUILD_EXAMPLES LLVM_HAS_NVPTX_TARGET + LLVM_INCLUDE_SPIRV_TOOLS_TESTS MLIR_ENABLE_BINDINGS_PYTHON MLIR_ENABLE_CUDA_RUNNER MLIR_ENABLE_ROCM_CONVERSIONS @@ -217,6 +218,11 @@ if(MLIR_ENABLE_BINDINGS_PYTHON) ) endif() +if (LLVM_INCLUDE_SPIRV_TOOLS_TESTS) + list(APPEND MLIR_TEST_DEPENDS spirv-as) + list(APPEND MLIR_TEST_DEPENDS spirv-val) +endif() + # This target can be used to just build the dependencies # for the check-mlir target without executing the tests. 
# This is useful for bots when splitting the build step diff --git a/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir b/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir index 9a0f2b7714544..7c78211d59010 100644 --- a/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir +++ b/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir @@ -67,3 +67,17 @@ func.func @unreachable_block() { ^bb1(%arg0: index): cf.br ^bb1(%arg0 : index) } + +// ----- + +// Test case for cf.cond_br with weights. + +// CHECK-LABEL: func.func @cf_cond_br_with_weights( +func.func @cf_cond_br_with_weights(%cond: i1, %a: index, %b: index) -> index { +// CHECK: llvm.cond_br %{{.*}} weights([90, 10]), ^bb1(%{{.*}} : i64), ^bb2(%{{.*}} : i64) + cf.cond_br %cond, ^bb1(%a : index), ^bb2(%b : index) {branch_weights = array} +^bb1(%arg1: index): + return %arg1 : index +^bb2(%arg2: index): + return %arg2 : index +} diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir index acfc188574255..51d56389dac9e 100644 --- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir @@ -452,11 +452,19 @@ func.func @atomic_rmw(%I : memref<10xi32>, %ival : i32, %F : memref<10xf32>, %fv // CHECK: llvm.atomicrmw umin %{{.*}}, %{{.*}} acq_rel memref.atomic_rmw addf %fval, %F[%i] : (f32, memref<10xf32>) -> f32 // CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} acq_rel + memref.atomic_rmw maximumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32 + // CHECK: llvm.atomicrmw fmaximum %{{.*}}, %{{.*}} acq_rel + memref.atomic_rmw maxnumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32 + // CHECK: llvm.atomicrmw fmax %{{.*}}, %{{.*}} acq_rel + memref.atomic_rmw minimumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32 + // CHECK: llvm.atomicrmw fminimum %{{.*}}, %{{.*}} acq_rel + memref.atomic_rmw minnumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32 + // CHECK: llvm.atomicrmw fmin %{{.*}}, %{{.*}} acq_rel memref.atomic_rmw 
ori %ival, %I[%i] : (i32, memref<10xi32>) -> i32 // CHECK: llvm.atomicrmw _or %{{.*}}, %{{.*}} acq_rel memref.atomic_rmw andi %ival, %I[%i] : (i32, memref<10xi32>) -> i32 // CHECK: llvm.atomicrmw _and %{{.*}}, %{{.*}} acq_rel - // CHECK-INTERFACE-COUNT-9: llvm.atomicrmw + // CHECK-INTERFACE-COUNT-13: llvm.atomicrmw return } diff --git a/mlir/test/Dialect/ControlFlow/invalid.mlir b/mlir/test/Dialect/ControlFlow/invalid.mlir index b51d8095c9974..1b8de22a9ff9f 100644 --- a/mlir/test/Dialect/ControlFlow/invalid.mlir +++ b/mlir/test/Dialect/ControlFlow/invalid.mlir @@ -67,3 +67,39 @@ func.func @switch_missing_default(%flag : i32, %caseOperand : i32) { ^bb3(%bb3arg : i32): return } + +// ----- + +// CHECK-LABEL: func @wrong_weights_number +func.func @wrong_weights_number(%cond: i1) { + // expected-error@+1 {{expects number of branch weights to match number of successors: 1 vs 2}} + cf.cond_br %cond weights([100]), ^bb1, ^bb2 + ^bb1: + return + ^bb2: + return +} + +// ----- + +// CHECK-LABEL: func @negative_weight +func.func @wrong_total_weight(%cond: i1) { + // expected-error@+1 {{weight #0 must be non-negative}} + cf.cond_br %cond weights([-1, 101]), ^bb1, ^bb2 + ^bb1: + return + ^bb2: + return +} + +// ----- + +// CHECK-LABEL: func @zero_weights +func.func @wrong_total_weight(%cond: i1) { + // expected-error@+1 {{branch weights cannot all be zero}} + cf.cond_br %cond weights([0, 0]), ^bb1, ^bb2 + ^bb1: + return + ^bb2: + return +} diff --git a/mlir/test/Dialect/ControlFlow/ops.mlir b/mlir/test/Dialect/ControlFlow/ops.mlir index c9317c7613972..160534240e0fa 100644 --- a/mlir/test/Dialect/ControlFlow/ops.mlir +++ b/mlir/test/Dialect/ControlFlow/ops.mlir @@ -51,3 +51,13 @@ func.func @switch_result_number(%arg0: i32) { ^bb2: return } + +// CHECK-LABEL: func @cond_weights +func.func @cond_weights(%cond: i1) { +// CHECK: cf.cond_br %{{.*}} weights([60, 40]), ^{{.*}}, ^{{.*}} + cf.cond_br %cond weights([60, 40]), ^bb1, ^bb2 + ^bb1: + return + ^bb2: + return +} diff --git 
a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir index 3793dfe3f173b..3946a36a83c6f 100644 --- a/mlir/test/Dialect/EmitC/invalid_ops.mlir +++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir @@ -346,6 +346,28 @@ func.func @test_expression_multiple_results(%arg0: i32) -> i32 { // ----- +emitc.func @test_expression_no_defining_op(%a : i32) { + // expected-error @+1 {{'emitc.expression' op yielded value has no defining op}} + %res = emitc.expression : i32 { + emitc.yield %a : i32 + } + + return +} + +// ----- + +emitc.func @test_expression_op_outside_expression() { + %cond = literal "true" : i1 + // expected-error @+1 {{'emitc.expression' op yielded value not defined within expression}} + %res = emitc.expression : i1 { + emitc.yield %cond : i1 + } + return +} + +// ----- + // expected-error @+1 {{'emitc.func' op requires zero or exactly one result, but has 2}} emitc.func @multiple_results(%0: i32) -> (i32, i32) { emitc.return %0 : i32 diff --git a/mlir/test/Dialect/EmitC/ops.mlir b/mlir/test/Dialect/EmitC/ops.mlir index 36d12e763afc7..ad40313f95df9 100644 --- a/mlir/test/Dialect/EmitC/ops.mlir +++ b/mlir/test/Dialect/EmitC/ops.mlir @@ -246,12 +246,20 @@ emitc.verbatim "typedef float f32;" // The value is not interpreted as format string if there are no operands. emitc.verbatim "{} { }" -func.func @test_verbatim(%arg0 : !emitc.ptr, %arg1 : i32) { +func.func @test_verbatim(%arg0 : !emitc.ptr, %arg1 : i32, %arg2: !emitc.array<3x!emitc.ptr>) { + %a = "emitc.variable"() <{value = #emitc.opaque<"1">}> : () -> !emitc.lvalue + + // Check that the lvalue type can be used by verbatim. + emitc.verbatim "++{};" args %a : !emitc.lvalue + + // Check that the array type can be used by verbatim. + emitc.verbatim "*{}[0] = 1;" args %arg2 : !emitc.array<3x!emitc.ptr> + emitc.verbatim "{} + {};" args %arg0, %arg1 : !emitc.ptr, i32 - // Check there is no ambiguity whether %a is the argument to the emitc.verbatim op. 
- emitc.verbatim "a" - %a = "emitc.constant"(){value = 42 : i32} : () -> i32 + // Check there is no ambiguity whether %b is the argument to the emitc.verbatim op. + emitc.verbatim "b" + %b = "emitc.constant"(){value = 42 : i32} : () -> i32 return } diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir index 642346cc40b0d..5c5d94c40e573 100644 --- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir @@ -1000,3 +1000,69 @@ func.func @unpack_half_2x16_scalar_out(%arg0 : i32) -> () { %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> f32 return } + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.GL.Length +//===----------------------------------------------------------------------===// + +func.func @length(%arg0 : f32) -> () { + // CHECK: spirv.GL.Length {{%.*}} : f32 -> f32 + %0 = spirv.GL.Length %arg0 : f32 -> f32 + return +} + +func.func @lengthvec(%arg0 : vector<3xf32>) -> () { + // CHECK: spirv.GL.Length {{%.*}} : vector<3xf32> -> f32 + %0 = spirv.GL.Length %arg0 : vector<3xf32> -> f32 + return +} + +// ----- + +func.func @length_i32_in(%arg0 : i32) -> () { + // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'i32'}} + %0 = spirv.GL.Length %arg0 : i32 -> f32 + return +} + +// ----- + +func.func @length_f16_in(%arg0 : f16) -> () { + // expected-error @+1 {{op failed to verify that result type must match operand element type}} + %0 = spirv.GL.Length %arg0 : f16 -> f32 + return +} + +// ----- + +func.func @length_i32vec_in(%arg0 : vector<3xi32>) -> () { + // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'vector<3xi32>'}} + %0 = spirv.GL.Length %arg0 : vector<3xi32> -> f32 + return +} + +// ----- + +func.func @length_f16vec_in(%arg0 : vector<3xf16>) -> () { + // expected-error @+1 
{{op failed to verify that result type must match operand element type}} + %0 = spirv.GL.Length %arg0 : vector<3xf16> -> f32 + return +} + +// ----- + +func.func @length_i32_out(%arg0 : vector<3xf32>) -> () { + // expected-error @+1 {{op result #0 must be 16/32/64-bit float, but got 'i32'}} + %0 = spirv.GL.Length %arg0 : vector<3xf32> -> i32 + return +} + +// ----- + +func.func @length_vec_out(%arg0 : vector<3xf32>) -> () { + // expected-error @+1 {{op result #0 must be 16/32/64-bit float, but got 'vector<3xf32>'}} + %0 = spirv.GL.Length %arg0 : vector<3xf32> -> vector<3xf32> + return +} diff --git a/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir b/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir index c1447b38f0a48..33b877667512e 100644 --- a/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir +++ b/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir @@ -177,3 +177,25 @@ func.func @clamp_ulessthanequal(%input: i32, %min: i32, %max: i32) -> i32 { // CHECK-NEXT: spirv.ReturnValue [[RES]] spirv.ReturnValue %2 : i32 } + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.GL.Length +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: @convert_length_into_fabs_scalar +func.func @convert_length_into_fabs_scalar(%arg0 : f32) -> f32 { + //CHECK: spirv.GL.FAbs {{%.*}} : f32 + //CHECK-NOT: spirv.GL.Length + %0 = spirv.GL.Length %arg0 : f32 -> f32 + spirv.ReturnValue %0 : f32 +} + +// CHECK-LABEL: @dont_convert_length_into_fabs_vec +func.func @dont_convert_length_into_fabs_vec(%arg0 : vector<3xf32>) -> f32 { + //CHECK: spirv.GL.Length {{%.*}} : vector<3xf32> -> f32 + //CHECK-NOT: spirv.GL.FAbs + %0 = spirv.GL.Length %arg0 : vector<3xf32> -> f32 + spirv.ReturnValue %0 : f32 +} diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir index 1d1be9eda3496..ce8f69c58701d 100644 --- 
a/mlir/test/Dialect/Transform/test-pass-application.mlir +++ b/mlir/test/Dialect/Transform/test-pass-application.mlir @@ -164,6 +164,128 @@ module attributes {transform.with_named_sequence} { // ----- +// CHECK-LABEL: func private @valid_multiple_values_as_list_option_single_param() +module { + func.func @valid_multiple_values_as_list_option_single_param() { + return + } + + // CHECK: func @a() + func.func @a() { + return + } + // CHECK: func @b() + func.func @b() { + return + } +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op + %symbol_a = transform.param.constant "a" -> !transform.any_param + %symbol_b = transform.param.constant "b" -> !transform.any_param + %multiple_symbol_names = transform.merge_handles %symbol_a, %symbol_b : !transform.any_param + transform.apply_registered_pass "symbol-privatize" + with options = { exclude = %multiple_symbol_names } to %2 + : (!transform.any_op, !transform.any_param) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func private @valid_array_attr_as_list_option() +module { + func.func @valid_array_attr_as_list_option() { + return + } + + // CHECK: func @a() + func.func @a() { + return + } + // CHECK: func @b() + func.func @b() { + return + } +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op + transform.apply_registered_pass "symbol-privatize" + with options = { exclude = ["a", "b"] } to %2 + : (!transform.any_op) -> !transform.any_op + 
transform.yield + } +} + +// ----- + +// CHECK-LABEL: func private @valid_array_attr_param_as_list_option() +module { + func.func @valid_array_attr_param_as_list_option() { + return + } + + // CHECK: func @a() + func.func @a() { + return + } + // CHECK: func @b() + func.func @b() { + return + } +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op + %multiple_symbol_names = transform.param.constant ["a","b"] -> !transform.any_param + transform.apply_registered_pass "symbol-privatize" + with options = { exclude = %multiple_symbol_names } to %2 + : (!transform.any_op, !transform.any_param) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func private @valid_multiple_params_as_single_list_option() +module { + func.func @valid_multiple_params_as_single_list_option() { + return + } + + // CHECK: func @a() + func.func @a() { + return + } + // CHECK: func @b() + func.func @b() { + return + } +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op + %symbol_a = transform.param.constant "a" -> !transform.any_param + %symbol_b = transform.param.constant "b" -> !transform.any_param + transform.apply_registered_pass "symbol-privatize" + with options = { exclude = [%symbol_a, %symbol_b] } to %2 + : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op + transform.yield + } +} + +// ----- + func.func @invalid_options_as_str() { return } @@ -203,7 +325,8 @@ func.func 
@invalid_options_due_to_reserved_attr() { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @+2 {{the param_operand attribute is a marker reserved for indicating a value will be passed via params and is only used in the generic print format}} + // expected-error @+3 {{the param_operand attribute is a marker reserved for indicating a value will be passed via params and is only used in the generic print format}} + // expected-error @+2 {{expected a valid attribute or operand as value associated to key 'top-down'}} %2 = transform.apply_registered_pass "canonicalize" with options = { "top-down" = #transform.param_operand } to %1 : (!transform.any_op) -> !transform.any_op transform.yield @@ -262,26 +385,6 @@ module attributes {transform.with_named_sequence} { // ----- -func.func @too_many_pass_option_params() { - return -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op) { - %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %x = transform.param.constant true -> !transform.any_param - %y = transform.param.constant false -> !transform.any_param - %topdown_options = transform.merge_handles %x, %y : !transform.any_param - // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 2}} - transform.apply_registered_pass "canonicalize" - with options = { "top-down" = %topdown_options } to %1 - : (!transform.any_op, !transform.any_param) -> !transform.any_op - transform.yield - } -} - -// ----- - module attributes {transform.with_named_sequence} { // expected-error @below {{trying to schedule a pass on an unsupported operation}} // expected-note @below {{target op}} diff --git 
a/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir b/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir index 52b0fdee184f6..7a1d6b3a8344a 100644 --- a/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir +++ b/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir @@ -148,3 +148,33 @@ func.func @subvector_extract(%m: memref, %idx: index) -> vector<16xf32> return %1 : vector<16xf32> } +// ----- + +// CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 + s1)> +// CHECK-LABEL: func @transfer_read_1d_extract_dynamic( +// CHECK-SAME: %[[MEMREF:.*]]: memref, %[[M_IDX:.*]]: index, %[[E_IDX:.*]]: index +// CHECK: %[[APPLY:.*]] = affine.apply #[[$MAP]]()[%[[M_IDX]], %[[E_IDX]]] +// CHECK: %[[RES:.*]] = memref.load %[[MEMREF]][%[[APPLY]]] +func.func @transfer_read_1d_extract_dynamic(%m: memref, %idx: index, + %offset: index) -> f32 { + %cst = arith.constant 0.0 : f32 + %vec = vector.transfer_read %m[%idx], %cst {in_bounds = [true]} : memref, vector<5xf32> + %elem = vector.extract %vec[%offset] : f32 from vector<5xf32> + return %elem : f32 +} + +// ----- + +// CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 + s1)> +// CHECK-LABEL: func @transfer_read_2d_extract_dynamic( +// CHECK-SAME: %[[MEMREF:.*]]: memref, %[[ROW_IDX:.*]]: index, %[[COL_IDX:.*]]: index, %[[ROW_OFFSET:.*]]: index, %[[COL_OFFSET:.*]]: index +// CHECK: %[[ROW_APPLY:.*]] = affine.apply #[[$MAP]]()[%[[ROW_IDX]], %[[ROW_OFFSET]]] +// CHECK: %[[COL_APPLY:.*]] = affine.apply #[[$MAP]]()[%[[COL_IDX]], %[[COL_OFFSET]]] +// CHECK: %[[RES:.*]] = memref.load %[[MEMREF]][%[[ROW_APPLY]], %[[COL_APPLY]]] +func.func @transfer_read_2d_extract_dynamic(%m: memref, %row_idx: index, %col_idx: index, + %row_offset: index, %col_offset: index) -> f32 { + %cst = arith.constant 0.0 : f32 + %vec = vector.transfer_read %m[%row_idx, %col_idx], %cst {in_bounds = [true, true]} : memref, vector<10x5xf32> + %elem = vector.extract %vec[%row_offset, %col_offset] : f32 from vector<10x5xf32> + 
return %elem : f32 +} diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index b05c317231ad9..0a37ae70b5d99 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s -split-input-file -verify-diagnostics // ----- -func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) { +func.func @create_nd_tdesc_vc_1(%src: memref<24xf32>) { // expected-error@+1 {{Expecting the TensorDesc rank is up to 2 and not greater than the ranks of shape, strides, offsets or the memref source}} %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32> return @@ -9,49 +9,49 @@ func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) { // ----- -func.func @test_create_nd_tdesc_vc_2(%src: memref<24x32xf32>) { +func.func @create_nd_tdesc_vc_2(%src: memref<24x32xf32>) { // expected-error@+1 {{TensorDesc should have the same element type with the source if it is a memref}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16> return } // ----- -func.func @test_create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) { +func.func @create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) { // expected-error@+1 {{SLM is not supported for 2D block tensor}} %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> return } // ----- -func.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) { +func.func @create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) { // expected-error@+1 {{Memory space mismatch}} %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32> return } // ----- -func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : 
memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- -func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- -func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- -func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) { +func.func @prefetch_nd_vc_1(%src: memref<24x32xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> @@ -59,7 +59,7 @@ func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) { } // ----- -func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) { +func.func @prefetch_nd_vc_2(%src: memref<24xf16>) { %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex> %1 = xegpu.create_tdesc %src, %0 : memref<24xf16>, vector<8xindex> -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>> @@ -70,7 +70,7 @@ func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) { } // ----- -func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) { +func.func @load_nd_vc_1(%src: memref<8x16xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}> @@ -79,7 +79,7 @@ func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) { } // ----- 
-func.func @test_load_nd_vc_2(%src: memref<16xf16>) { +func.func @load_nd_vc_2(%src: memref<16xf16>) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> %1 = xegpu.create_tdesc %src, %0 : memref<16xf16>, vector<8xindex> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> @@ -90,7 +90,7 @@ func.func @test_load_nd_vc_2(%src: memref<16xf16>) { } // ----- -func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) { +func.func @load_nd_vc_3(%src: memref<8x16xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> // expected-warning@+1 {{Invalid Packed Attr.}} %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> @@ -99,7 +99,7 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) { } // ----- -func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) { +func.func @load_nd_vc_4(%src: memref<24x32xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // expected-error@+1 {{Result shape [8, 1] is not consistent with tensor descriptor}} @@ -110,7 +110,7 @@ func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) { } // ----- -func.func @test_load_nd_layout(%src: memref<24x32xf32>) { +func.func @load_nd_layout(%src: memref<24x32xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32> // expected-error@+1 {{Result shape [3] is not a valid distribution for tensor descriptor}} %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, @@ -119,7 +119,7 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) { } // ----- -func.func @test_load_nd_simt(%src: memref<24x32xf32>) { +func.func @load_nd_simt(%src: memref<24x32xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // expected-error@+1 {{TensorDesc doesn't need LayoutAttr for SIMT code}} %2 = xegpu.load_nd %1 : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8xf32> @@ -127,7 +127,7 @@ 
func.func @test_load_nd_simt(%src: memref<24x32xf32>) { } // ----- -func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) { +func.func @store_nd_vc_1(%dst: memref<24x32xf16>) { %1 = arith.constant dense<1.0>: vector<24x32xf16> %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}} @@ -136,7 +136,7 @@ func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) { } // ----- -func.func @test_store_nd_vc_2(%dst: memref<16xf16>) { +func.func @store_nd_vc_2(%dst: memref<16xf16>) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> %1 = arith.constant dense<1.0>: vector<8x2xf16> %2 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex> @@ -148,7 +148,7 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) { } // ----- -func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) { +func.func @store_nd_vc_3(%dst: memref<24x32xf16>) { %1 = arith.constant dense<1.0>: vector<2x24x32xf16> %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr> // expected-error@+1 {{array length is not supported by store_nd}} @@ -157,7 +157,7 @@ func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) { } // ----- -func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) { +func.func @store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) { %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32> // expected-error@+1 {{Value shape [3] is not a valid distribution for tensor descriptor}} xegpu.store_nd %data, %1 : vector<3xf32>, !xegpu.tensor_desc<16xf32> @@ -165,7 +165,7 @@ func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) { } // ----- -func.func @test_store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) { +func.func @store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : 
memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> // expected-error@+1 {{TensorDesc doesn't need LayoutAttr for SIMT code}} xegpu.store_nd %data, %1 : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> @@ -173,7 +173,7 @@ func.func @test_store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) { } // ----- -func.func @test_store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) { +func.func @store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) { %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // expected-error@+1 {{Value shape [8, 1] is not consistent with tensor descriptor}} @@ -182,7 +182,7 @@ func.func @test_store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) { } // ----- -func.func @test_update_nd_offset_1(%dst: memref<16xf16>) { +func.func @update_nd_offset_1(%dst: memref<16xf16>) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> %1 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> @@ -192,7 +192,7 @@ func.func @test_update_nd_offset_1(%dst: memref<16xf16>) { } // ----- -func.func @test_create_tdesc_vc_1(%src: ui64) { +func.func @create_tdesc_vc_1(%src: ui64) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> // expected-error@+1 {{Expects a scattered TensorDesc}} %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> -> !xegpu.tensor_desc<8xf16> @@ -200,7 +200,7 @@ func.func @test_create_tdesc_vc_1(%src: ui64) { } // ----- -func.func @test_create_tdesc_vc_2(%src: ui64) { +func.func @create_tdesc_vc_2(%src: ui64) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> // expected-error@+1 {{expected chunk blocks for 2D tensor}} @@ -209,7 +209,7 @@ func.func @test_create_tdesc_vc_2(%src: ui64) { } // ----- -func.func @test_create_tdesc_vc_3(%src: memref) { +func.func 
@create_tdesc_vc_3(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> // expected-error@+1 {{Memory space mismatch}} %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> @@ -218,7 +218,7 @@ func.func @test_create_tdesc_vc_3(%src: memref) { } // ----- -func.func @test_create_tdesc_vc_4(%src: memref) { +func.func @create_tdesc_vc_4(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> // expected-error@+1 {{invalid chunk size}} @@ -227,7 +227,7 @@ func.func @test_create_tdesc_vc_4(%src: memref) { } // ----- -func.func @test_create_tdesc_vc_5(%src: memref) { +func.func @create_tdesc_vc_5(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> // expected-error@+1 {{expected tensor shape[1] to match chunk size}} @@ -236,7 +236,7 @@ func.func @test_create_tdesc_vc_5(%src: memref) { } // ----- -func.func @test_create_tdesc_vc_6(%src: memref) { +func.func @create_tdesc_vc_6(%src: memref) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> // expected-error@+1 {{tensor shape[1] to be a multiple of packing factor 2}} @@ -246,7 +246,7 @@ func.func @test_create_tdesc_vc_6(%src: memref) { // ----- -func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) { +func.func @prefetch_vc_1(%src: memref<24x32xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> // expected-error@+1 {{Expects a scattered TensorDesc}} xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<24x32xf16> @@ -254,7 +254,7 @@ func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) { } // ----- -func.func @test_prefetch_vc_2(%src: ui64) { +func.func @prefetch_vc_2(%src: ui64) { %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> 
!xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> @@ -264,7 +264,7 @@ func.func @test_prefetch_vc_2(%src: ui64) { } // ----- -func.func @test_create_tdesc_layout_1(%src: ui64) { +func.func @create_tdesc_layout_1(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> // expected-error@+1 {{expected layout rank to match tensor rank}} %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> @@ -272,7 +272,7 @@ func.func @test_create_tdesc_layout_1(%src: ui64) { } // ----- -func.func @test_create_tdesc_layout_2(%src: ui64) { +func.func @create_tdesc_layout_2(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> // expected-error@+1 {{cannot map over non-contiguous scattered row elements}} %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> @@ -280,7 +280,7 @@ func.func @test_create_tdesc_layout_2(%src: ui64) { } // ----- -func.func @test_create_tdesc_layout_3(%src: ui64) { +func.func @create_tdesc_layout_3(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> // expected-error@+1 {{work item data mapping must match the number of contiguous elements}} %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> @@ -288,7 +288,7 @@ func.func @test_create_tdesc_layout_3(%src: ui64) { } // ----- -func.func @test_load_gather_simt_1(%src: ui64) { +func.func @load_gather_simt_1(%src: ui64) { %0 = arith.constant dense<1>: vector<4xi1> %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> @@ -298,7 +298,7 @@ func.func @test_load_gather_simt_1(%src: ui64) { } // ----- -func.func @test_store_scatter_simt_1(%src: ui64) { +func.func @store_scatter_simt_1(%src: ui64) { 
%0 = arith.constant dense<1>: vector<4xi1> %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %val = arith.constant dense<2.9>: vector<6xf32> @@ -309,7 +309,7 @@ func.func @test_store_scatter_simt_1(%src: ui64) { } // ----- -func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) { +func.func @load_gather_vc_1(%src: memref<24x32xf16>) { %0 = arith.constant dense<1>: vector<4xi1> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16> // expected-error@+1 {{Expects a scattered TensorDesc}} @@ -319,7 +319,7 @@ func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) { } // ----- -func.func @test_load_gather_vc_2(%src: ui64) { +func.func @load_gather_vc_2(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<1>: vector<4xi1> %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> @@ -332,7 +332,7 @@ func.func @test_load_gather_vc_2(%src: ui64) { } // ----- -func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) { +func.func @store_scatter_vc_1(%src: memref<24x32xf32>) { %0 = arith.constant dense<1>: vector<4xi1> %1 = arith.constant dense<2.9>: vector<4x2xf32> %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32> @@ -343,7 +343,7 @@ func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) { } // ----- -func.func @test_store_scatter_vc_2(%src: ui64) { +func.func @store_scatter_vc_2(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex> %0 = arith.constant dense<1>: vector<4xi1> %1 = arith.constant dense<2.9>: vector<4x2xf32> @@ -356,49 +356,49 @@ func.func @test_store_scatter_vc_2(%src: ui64) { } // ----- -func.func @test_dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) { +func.func @dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) { // expected-error@+1 {{K-dimension mismatch}} %1 = xegpu.dpas %a, %b : vector<8x8xf16>, vector<8x16x2xf16> -> vector<8x16xf32> return } // ----- -func.func 
@test_dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) { +func.func @dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) { // expected-error@+1 {{expecting lhs and result to be a 2D vector, and rhs to be either 2D or 3D (packed) vector}} %1 = xegpu.dpas %a, %b : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> return } // ----- -func.func @test_dpas_3(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) { +func.func @dpas_3(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) { // expected-error@+1 {{K-dimension mismatch}} %1 = xegpu.dpas %a, %b : vector<8x8xf16>, vector<8x16x2xf16> -> vector<8x16xf32> return } // ----- -func.func @test_dpas_4(%a : vector<16x16xf16>, %b: vector<8x16x2xf16>) { +func.func @dpas_4(%a : vector<16x16xf16>, %b: vector<8x16x2xf16>) { // expected-error@+1 {{M-dimension mismatch}} %1 = xegpu.dpas %a, %b : vector<16x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> return } // ----- -func.func @test_dpas_5(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) { +func.func @dpas_5(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) { // expected-error@+1 {{N-dimension mismatch}} %1 = xegpu.dpas %a, %b : vector<8x16xf16>, vector<8x8x2xf16> -> vector<8x16xf32> return } // ----- -func.func @test_dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) { +func.func @dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) { // expected-error@+1 {{Expecting B operand to be a multiple of 32 bits}} %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<15xf16> -> vector<8xf32> return } // ----- -func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) { +func.func @atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) { %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> %1 = xegpu.create_tdesc %src, %0 : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{failed to verify that all of {tensorDesc, value, result} have same 
shape}} @@ -512,7 +512,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto } // ----- -func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) { +func.func @convert_layout_same_map(%a: vector<32x64xf16>) { // expected-error@+1 {{expected different srcMap and resMap}} %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, resMap = #xegpu.layout} : vector<32x64xf16> @@ -520,7 +520,7 @@ func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) { } // ----- -func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) { +func.func @convert_layout_unmatch(%a: vector<32x64xf16>) { // expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}} %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, resMap = #xegpu.layout} : vector<32x64xf16> diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir new file mode 100644 index 0000000000000..7f3ebec225cdf --- /dev/null +++ b/mlir/test/Dialect/XeGPU/layout.mlir @@ -0,0 +1,49 @@ +// RUN: mlir-opt %s | FileCheck %s +// Verify the printed output can be parsed. +// RUN: mlir-opt %s | mlir-opt | FileCheck %s +// Verify the generic form can be parsed. 
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s + +// CHECK-LABEL: gpu.module @test { +gpu.module @test { +// CHECK: gpu.func @create_nd_tdesc_subgroup_1(%[[arg0:.*]]: memref<128x128xf32>) { +gpu.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + gpu.return +} + +// CHECK: gpu.func @create_nd_tdesc_subgroup_2(%[[arg0:.*]]: memref<128x128xf32>) { +gpu.func @create_nd_tdesc_subgroup_2(%src: memref<128x128xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + gpu.return +} + +// CHECK: gpu.func @create_nd_tdesc_subgroup_3(%[[arg0:.*]]: memref<128x128xf32>) { +gpu.func @create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> + gpu.return +} + +// CHECK: gpu.func @create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + gpu.return +} + +gpu.func @convert_layout(%a: vector<32x64xf16>) { + %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, + resMap = #xegpu.layout} : vector<32x64xf16> + gpu.return +} + +gpu.func @convert_layout_wg(%a: vector<32x64xf16>) { + 
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, + resMap = #xegpu.layout} : vector<32x64xf16> + gpu.return +} + +} diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 76af59d6aedc7..054c4d12fdb28 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -6,23 +6,15 @@ // CHECK-LABEL: gpu.module @test { gpu.module @test { -// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) { +// CHECK: gpu.func @create_nd_tdesc_1(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @create_nd_tdesc_1(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> - !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) { -gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { +// CHECK: gpu.func @create_nd_tdesc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) { +gpu.func @create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { //CHECK: %[[C:.*]] = arith.constant 1 : index %c1 = arith.constant 1 : index // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> 
!xegpu.tensor_desc<8x16xf32> @@ -30,94 +22,41 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) { -gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { - //CHECK: %[[C:.*]] = arith.constant 1 : index - %c1 = arith.constant 1 : index - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) { +// CHECK: gpu.func @create_nd_tdesc_3(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @create_nd_tdesc_3(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_nd_tdesc_vc_4(%[[arg0:.*]]: memref<2x24x32xf32>) { -gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) { +// CHECK: gpu.func @create_nd_tdesc_4(%[[arg0:.*]]: 
memref<2x24x32xf32>) { +gpu.func @create_nd_tdesc_4(%src: memref<2x24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32> gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) { -gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_nd_tdesc_vc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) { -gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) { +// CHECK: gpu.func @create_nd_tdesc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) { +gpu.func @create_nd_tdesc_5(%src: memref<2x24x32xf32, 3>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) { -gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_nd_tdesc_vc_6(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) { +// CHECK: gpu.func @create_nd_tdesc_6(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func 
@create_nd_tdesc_6(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_create_nd_tdesc_subgroup_1(%[[arg0:.*]]: memref<128x128xf32>) { -gpu.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_nd_tdesc_subgroup_2(%[[arg0:.*]]: memref<128x128xf32>) { -gpu.func @test_create_nd_tdesc_subgroup_2(%src: memref<128x128xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_create_nd_tdesc_subgroup_3(%[[arg0:.*]]: memref<128x128xf32>) { -gpu.func @test_create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> 
!xegpu.tensor_desc<128x128xf32, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) { +// CHECK: gpu.func @prefetch_nd(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @prefetch_nd(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> @@ -125,17 +64,9 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) { gpu.return } -// CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) { - // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - gpu.return -} -// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) { -gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) { +// CHECK: func @subgroup_load_nd(%[[arg0:.*]]: memref<8x16xf16>) { +gpu.func @subgroup_load_nd(%src: memref<8x16xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<8x16xf16> -> 
vector<4x16x2xf16> @@ -144,8 +75,8 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) { gpu.return } -// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) { -gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) { +// CHECK: func @simt_load_nd(%[[arg0:.*]]: memref<8x16xf16>) { +gpu.func @simt_load_nd(%src: memref<8x16xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> @@ -154,8 +85,8 @@ gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) { gpu.return } -// CHECK: func @test_load_nd_vc_2(%[[arg0:.*]]: memref<8x16xf16>) { -gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) { +// CHECK: func @subgroup_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) { +gpu.func @subgroup_load_nd_2(%src: memref<8x16xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16> @@ -163,8 +94,8 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) { gpu.return } -// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) { -gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) { +// CHECK: func @simt_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) { +gpu.func @simt_load_nd_2(%src: memref<8x16xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = 
#xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16> @@ -172,8 +103,8 @@ gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) { gpu.return } -// CHECK: func @test_load_nd_vc_3(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) { +// CHECK: func @subgroup_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @subgroup_load_nd_3(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> @@ -181,8 +112,8 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) { +// CHECK: func @simt_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @simt_load_nd_3(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> @@ -190,8 +121,8 @@ gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_load_nd_vc_4(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) { +// CHECK: func @subgroup_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @subgroup_load_nd_4(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> 
!xegpu.tensor_desc<16x16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> @@ -199,8 +130,8 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) { +// CHECK: func @simt_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @simt_load_nd_4(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> @@ -208,8 +139,8 @@ gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_vc_5(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) { +// CHECK: func @subgroup_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @subgroup_load_nd_5(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32> @@ -217,8 +148,8 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) { +// CHECK: func @simt_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @simt_load_nd_5(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> 
-> !xegpu.tensor_desc<32xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32> @@ -226,8 +157,8 @@ gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_load_nd_vc_6(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) { +// CHECK: func @subgroup_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @subgroup_load_nd_6(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> @@ -235,8 +166,8 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) { +// CHECK: func @simt_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @simt_load_nd_6(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> @@ -245,8 +176,8 @@ gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_vc_7(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func 
@test_load_nd_vc_7(%src: memref<24x32xf16>) { +// CHECK: func @subgroup_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @subgroup_load_nd_7(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x8x16x2xf16> @@ -254,8 +185,8 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) { +// CHECK: func @simt_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @simt_load_nd_7(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> @@ -264,8 +195,8 @@ gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_load_nd_vc_8(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) { +// CHECK: func @subgroup_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @subgroup_load_nd_8(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = 
#xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> @@ -273,8 +204,8 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) { +// CHECK: func @simt_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose = array}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32> @@ -282,8 +213,8 @@ gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) { gpu.return } -// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) { +// CHECK: func @subgroup_store_nd(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @subgroup_store_nd(%dst: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16> %1 = arith.constant dense<1.0>: vector<24x32xf16> // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> @@ -293,8 +224,8 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) { gpu.return } -// CHECK: func @test_store_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) { +// CHECK: func @simt_store_nd(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @simt_store_nd(%src: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48xf16> %1 = arith.constant dense<1.0>: vector<48xf16> // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> @@ -306,8 +237,8 @@ 
gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) { -// CHECK: func @test_store_nd_vc_2(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) { +// CHECK: func @subgroup_store_nd_2(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @subgroup_store_nd_2(%dst: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16> %1 = arith.constant dense<1.0>: vector<32xf16> // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> @@ -318,8 +249,8 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) { } -// CHECK: func @test_store_nd_simt_2(%[[arg0:.*]]: memref<24x32xf16>) { -gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) { +// CHECK: func @simt_store_nd_2(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @simt_store_nd_2(%src: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16> %1 = arith.constant dense<1.0>: vector<2xf16> // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16> @@ -329,8 +260,8 @@ gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) { gpu.return } -// CHECK: gpu.func @test_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) { +// CHECK: gpu.func @update_nd_tdesc(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32> @@ -338,17 +269,9 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) { gpu.return } -// CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) { 
- // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) { -gpu.func @test_create_tdesc_vc(%src: ui64) { +// CHECK: gpu.func @create_tdesc(%[[arg0:.*]]: ui64) { +gpu.func @create_tdesc(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> @@ -356,18 +279,9 @@ gpu.func @test_create_tdesc_vc(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_create_tdesc_simt(%[[arg0:.*]]: ui64) { -gpu.func @test_create_tdesc_simt(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_create_tdesc_vc_1(%[[arg0:.*]]: memref) { -gpu.func @test_create_tdesc_vc_1(%src: memref) { +// CHECK: gpu.func @create_tdesc_1(%[[arg0:.*]]: memref) { +gpu.func @create_tdesc_1(%src: memref) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : 
memref, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> @@ -375,18 +289,9 @@ gpu.func @test_create_tdesc_vc_1(%src: memref) { gpu.return } -// CHECK: gpu.func @test_create_tdesc_simt_1(%[[arg0:.*]]: memref) { -gpu.func @test_create_tdesc_simt_1(%src: memref) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_create_tdesc_vc_2(%[[arg0:.*]]: memref) { -gpu.func @test_create_tdesc_vc_2(%src: memref) { +// CHECK: gpu.func @create_tdesc_2(%[[arg0:.*]]: memref) { +gpu.func @create_tdesc_2(%src: memref) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<> @@ -394,17 +299,9 @@ gpu.func @test_create_tdesc_vc_2(%src: memref) { gpu.return } -// CHECK: gpu.func @test_create_tdesc_simt_2(%[[arg0:.*]]: memref) { -gpu.func @test_create_tdesc_simt_2(%src: memref) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<> - %1 = xegpu.create_tdesc %src, %0 : memref, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return -} -// CHECK: gpu.func @test_create_tdesc_vc_3(%[[arg0:.*]]: ui64) { -gpu.func @test_create_tdesc_vc_3(%src: ui64) { 
+// CHECK: gpu.func @create_tdesc_3(%[[arg0:.*]]: ui64) { +gpu.func @create_tdesc_3(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr> @@ -413,17 +310,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) { } -// CHECK: gpu.func @test_create_tdesc_simt_3(%arg0: ui64) { -gpu.func @test_create_tdesc_simt_3(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return -} - -// CHECK: gpu.func @test_load_vc(%[[arg0:.*]]: ui64) { -gpu.func @test_load_vc(%src: ui64) { +// CHECK: gpu.func @subgroup_load(%[[arg0:.*]]: ui64) { +gpu.func @subgroup_load(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -435,8 +323,8 @@ gpu.func @test_load_vc(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_load_simt(%[[arg0:.*]]: ui64) { -gpu.func @test_load_simt(%src: ui64) { +// CHECK: gpu.func @simt_load(%[[arg0:.*]]: ui64) { +gpu.func @simt_load(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -448,8 +336,8 @@ gpu.func @test_load_simt(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_load_vc_2(%[[arg0:.*]]: ui64) { -gpu.func 
@test_load_vc_2(%src: ui64) { +// CHECK: gpu.func @subgroup_load_2(%[[arg0:.*]]: ui64) { +gpu.func @subgroup_load_2(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -461,8 +349,8 @@ gpu.func @test_load_vc_2(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_load_simt_2(%[[arg0:.*]]: ui64) { -gpu.func @test_load_simt_2(%src: ui64) { +// CHECK: gpu.func @simt_load_2(%[[arg0:.*]]: ui64) { +gpu.func @simt_load_2(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -474,8 +362,8 @@ gpu.func @test_load_simt_2(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_load_vc_3(%[[arg0:.*]]: ui64) { -gpu.func @test_load_vc_3(%src: ui64) { +// CHECK: gpu.func @subgroup_load_3(%[[arg0:.*]]: ui64) { +gpu.func @subgroup_load_3(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -487,8 +375,8 @@ gpu.func @test_load_vc_3(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_load_simt_3(%[[arg0:.*]]: ui64) { -gpu.func @test_load_simt_3(%src: ui64) { +// CHECK: gpu.func @simt_load_3(%[[arg0:.*]]: ui64) { +gpu.func @simt_load_3(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -500,8 +388,8 @@ gpu.func @test_load_simt_3(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_store_vc(%[[arg0:.*]]: ui64) { -gpu.func @test_store_vc(%src: ui64) { +// CHECK: gpu.func @subgroup_store(%[[arg0:.*]]: ui64) { +gpu.func @subgroup_store(%src: ui64) { 
//CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -517,8 +405,8 @@ gpu.func @test_store_vc(%src: ui64) { -// CHECK: gpu.func @test_store_simt(%[[arg0:.*]]: ui64) { -gpu.func @test_store_simt(%src: ui64) { +// CHECK: gpu.func @simt_store(%[[arg0:.*]]: ui64) { +gpu.func @simt_store(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -532,8 +420,8 @@ gpu.func @test_store_simt(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_store_vc_2(%[[arg0:.*]]: ui64) { -gpu.func @test_store_vc_2(%src: ui64) { +// CHECK: gpu.func @subgroup_store_2(%[[arg0:.*]]: ui64) { +gpu.func @subgroup_store_2(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -549,8 +437,8 @@ gpu.func @test_store_vc_2(%src: ui64) { -// CHECK: gpu.func @test_store_simt_2(%[[arg0:.*]]: ui64) { -gpu.func @test_store_simt_2(%src: ui64) { +// CHECK: gpu.func @simt_store_2(%[[arg0:.*]]: ui64) { +gpu.func @simt_store_2(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -564,8 +452,8 @@ gpu.func @test_store_simt_2(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_store_vc_3(%[[arg0:.*]]: ui64) { -gpu.func @test_store_vc_3(%src: ui64) { +// CHECK: gpu.func @subgroup_store_3(%[[arg0:.*]]: ui64) { +gpu.func @subgroup_store_3(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: 
%[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -580,8 +468,8 @@ gpu.func @test_store_vc_3(%src: ui64) { } -// CHECK: gpu.func @test_store_simt_3(%[[arg0:.*]]: ui64) { -gpu.func @test_store_simt_3(%src: ui64) { +// CHECK: gpu.func @simt_store_3(%[[arg0:.*]]: ui64) { +gpu.func @simt_store_3(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[cst1:.*]] = arith.constant dense : vector<4xi1> @@ -595,20 +483,8 @@ gpu.func @test_store_simt_3(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_prefetch_simt(%[[arg0:.*]]: ui64) { -gpu.func @test_prefetch_simt(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - gpu.return -} - - -// CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) { -gpu.func @test_prefetch_vc(%src: ui64) { +// CHECK: gpu.func @prefetch(%[[arg0:.*]]: ui64) { +gpu.func @prefetch(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> @@ -618,21 +494,9 @@ gpu.func @test_prefetch_vc(%src: ui64) { 
gpu.return } -// CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) { -gpu.func @test_create_update_tdesc_simt(%src: ui64) { - //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex> - //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xindex> - %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> - %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex> - %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<4xindex> - gpu.return -} -// CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) { -gpu.func @test_create_update_tdesc_vc(%src: ui64) { +// CHECK: gpu.func @create_update_tdesc(%[[arg0:.*]]: ui64) { +gpu.func @create_update_tdesc(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex> @@ -644,29 +508,29 @@ gpu.func @test_create_update_tdesc_vc(%src: ui64) { gpu.return } -// CHECK: gpu.func @test_dpas_vc(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>) -gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) { +// CHECK: gpu.func @subgroup_dpas(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>) +gpu.func @subgroup_dpas(%a : vector<8x16xf16>, %b: vector<16x16xf16>) { // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : 
vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> gpu.return } -// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>) -gpu.func @test_dpas_simt(%a : vector<8xf16>, %b: vector<16xf16>) { +// CHECK: gpu.func @simt_dpas(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>) +gpu.func @simt_dpas(%a : vector<8xf16>, %b: vector<16xf16>) { // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<16xf16> -> vector<8xf32> gpu.return } -// CHECK: gpu.func @test_dpas_vc_with_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>) -gpu.func @test_dpas_vc_with_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) { +// CHECK: gpu.func @subgroup_dpas_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>) +gpu.func @subgroup_dpas_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) { // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> gpu.return } -// CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>) -gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) { +// CHECK: gpu.func @subgroup_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>) +gpu.func @subgroup_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) { //CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> %c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, 
#xegpu.scatter_tdesc_attr<>> @@ -715,23 +579,4 @@ gpu.func @fence() { gpu.return } -// CHECK: gpu.func @test_create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) { -gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> - gpu.return -} - -gpu.func @test_convert_layout(%a: vector<32x64xf16>) { - %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, - resMap = #xegpu.layout} : vector<32x64xf16> - gpu.return -} - -gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) { - %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, - resMap = #xegpu.layout} : vector<32x64xf16> - gpu.return -} - } diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir index c7c82fc8dbb3c..35ac39d074c70 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt -xegpu-subgroup-distribute='print-analysis-only=true' -split-input-file %s | FileCheck %s -// CHECK: function: test_dpas_f16: +// CHECK: function: dpas_f16: // CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 // CHECK-NEXT: layout : Not assigned. 
// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1 @@ -23,7 +23,7 @@ // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { +func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -38,7 +38,7 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg // ----- -// CHECK: function: test_dpas_i8: +// CHECK: function: dpas_i8: // CHECK-NEXT: argument: of type 'vector<8x32xi8>' at index: 0 // CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 2] // CHECK-NEXT: argument: of type 'vector<32x16xi8>' at index: 1 @@ -51,7 +51,7 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { +func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> %1 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> @@ -60,7 +60,7 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: } // ----- -// CHECK: 
function: test_load_with_transpose_effect: +// CHECK: function: load_with_transpose_effect: // CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 // CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1 @@ -83,7 +83,7 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { +func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -97,7 +97,7 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre } // ----- -// CHECK: function: test_vector_transpose: +// CHECK: function: vector_transpose: // CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 // CHECK-NEXT: layout : Not assigned. 
// CHECK-NEXT: argument: of type 'memref<16x16xf16>' at index: 1 @@ -122,7 +122,7 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { +func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -137,7 +137,7 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1 } // ----- -// CHECK: function: test_extf_truncf: +// CHECK: function: extf_truncf: // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 // CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 @@ -152,7 +152,7 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> // CHECK-NEXT: layout for result #0: Not assigned. 
-func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> { +func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %2 = arith.extf %1 : vector<16x16xf16> to vector<16x16xf32> @@ -162,7 +162,7 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t } // ----- -// CHECK: function: test_load_gather_with_transpose_effect: +// CHECK: function: load_gather_with_transpose_effect: // CHECK-NEXT: argument: of type 'memref<8x16xf16>' at index: 0 // CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<256xf16>' at index: 1 @@ -187,7 +187,7 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { +func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> @@ -202,7 +202,7 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1 } // ----- -// CHECK: function: test_load_gather_1d: +// CHECK: function: load_gather_1d: // CHECK: argument: of type 'memref<256xf32>' at index: 0 // CHECK-NEXT: layout : Not assigned. 
// CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1 @@ -215,7 +215,7 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[T1]] = xegpu.load %[[T0]], %[[CST0]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { +func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %cst_0 = arith.constant dense : vector<16xi1> %0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> @@ -225,7 +225,7 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc } // ----- -// CHECK: function: test_store_scatter_with_transpose_effect: +// CHECK: function: store_scatter_with_transpose_effect: // CHECK-NEXT: argument: of type 'memref<128xf32>' at index: 0 // CHECK-NEXT: layout : Not assigned. 
// CHECK-NEXT: op : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32> @@ -236,7 +236,7 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> // CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1] -func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { +func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32> %cst_0 = arith.constant dense : vector<16xi1> %cst_1 = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> @@ -246,7 +246,7 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { } // ----- -// CHECK: function: test_store_scatter_1d: +// CHECK: function: store_scatter_1d: // CHECK-NEXT: argument: of type 'vector<16xf32>' at index: 0 // CHECK-NEXT: layout : lane_layout: [16], lane_data: [1] // CHECK-NEXT: argument: of type 'memref<256xf32>' at index: 1 @@ -257,7 +257,7 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) { // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { +func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> %cst_0 = arith.constant dense : vector<16xi1> %0 = 
xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> @@ -266,7 +266,7 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) } // ----- -// CHECK: function: test_vector_bitcast_i16_to_i8: +// CHECK: function: vector_bitcast_i16_to_i8: // CHECK-NEXT: argument: of type 'memref<8x16xi16>' at index: 0 // CHECK-NEXT: layout : Not assigned. // CHECK-NEXT: argument: of type 'memref<32x16xi8>' at index: 1 @@ -289,7 +289,7 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) { +func.func @vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16> %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8> @@ -303,7 +303,7 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref< } // ----- -// CHECK: function: test_vector_bitcast_i8_to_f16: +// CHECK: function: vector_bitcast_i8_to_f16: // CHECK-NEXT: argument: of type 'memref<8x32xi8>' at index: 0 // CHECK-NEXT: layout : Not assigned. 
// CHECK-NEXT: argument: of type 'memref<16x32xi8>' at index: 1 @@ -328,7 +328,7 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref< // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) { +func.func @vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8> %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8> @@ -343,7 +343,7 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1 } // ----- -// CHECK: function: test_binary_op_one_use: +// CHECK: function: binary_op_one_use: // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 // CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 @@ -360,7 +360,7 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { +func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { %0 = xegpu.load_nd %arg0 : 
!xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %2 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -371,7 +371,7 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x } // ----- -// CHECK: function: test_binary_op_multiple_uses: +// CHECK: function: binary_op_multiple_uses: // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 // CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 @@ -390,7 +390,7 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { +func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> %cst = arith.constant dense<1.000000e+00> : vector<16x16xf16> @@ -402,7 +402,7 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar } // ----- -// CHECK: function: test_for_op: +// CHECK: function: for_op: // CHECK-NEXT: argument: of type 'memref<8x128xf16>' at index: 0 // CHECK-NEXT: layout : Not assigned. 
// CHECK-NEXT: argument: of type 'memref<128x16xf16>' at index: 1 @@ -437,7 +437,7 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar // CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) { +func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %c16 = arith.constant 16 : index @@ -458,7 +458,7 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg } // ----- -// CHECK: function: test_if_single_use: +// CHECK: function: if_single_use: // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 // CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 @@ -477,7 +477,7 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1] // CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) { +func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = scf.if %arg2 -> (vector<16x16xf16>) { %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> 
-> vector<16x16xf16> @@ -492,7 +492,7 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu } // ----- -// CHECK: function: test_if_multiple_uses: +// CHECK: function: if_multiple_uses: // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<8x16xf16>' at index: 0 // CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16x16xf16>' at index: 1 @@ -513,7 +513,7 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: op : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1] -func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) { +func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = scf.if %arg2 -> (vector<16x16xf16>) { %3 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -529,7 +529,7 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe } // ----- -// CHECK: function: test_vector_outer_reduction: +// CHECK: function: vector_outer_reduction: // CHECK-NEXT: argument: of type 'vector<16x16xf32>' at index: 0 // CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1 @@ -538,7 +538,7 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : 
%[[T0:.*]] = vector.multi_reduction , %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { +func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<0.000000e+00> : vector<16xf32> %0 = vector.multi_reduction , %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32> xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> @@ -546,7 +546,7 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t } // ----- -// CHECK: function: test_vector_inner_reduction: +// CHECK: function: vector_inner_reduction: // CHECK-NEXT: argument: of type 'vector<16x16xf32>' at index: 0 // CHECK-NEXT: layout : lane_layout: [1, 16], lane_data: [1, 1] // CHECK-NEXT: argument: of type '!xegpu.tensor_desc<16xf32>' at index: 1 @@ -555,7 +555,7 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] // CHECK-NEXT: op : %[[T0:.*]] = vector.multi_reduction , %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to vector<16xf32> // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1] -func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { +func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<0.000000e+00> : vector<16xf32> %0 = vector.multi_reduction , %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32> xegpu.store_nd %0, %arg1 : vector<16xf32>, !xegpu.tensor_desc<16xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index 8e3673d04eacb..67d3bd9b393c0 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ 
b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -4,7 +4,7 @@ #b = #xegpu.layout #c = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_gemm_with_one_to_n_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { + gpu.func @gemm_with_one_to_n_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c32 = arith.constant 32 : index @@ -45,7 +45,7 @@ gpu.module @test_kernel { #l1 = #xegpu.layout #l2 = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_gemm_with_inst_data_only_attribute(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { + gpu.func @gemm_with_inst_data_only_attribute(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c32 = arith.constant 32 : index @@ -86,7 +86,7 @@ gpu.module @test_kernel { #l1 = #xegpu.layout #l2 = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_gemm_with_one_to_one_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { + gpu.func @gemm_with_one_to_one_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { %c0 = arith.constant 0 : index %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index @@ -130,7 +130,7 @@ gpu.module @test_kernel { #b = #xegpu.layout #c = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_gemm_with_elemwise_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { + gpu.func @gemm_with_elemwise_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index %c32 = arith.constant 32 : index @@ -172,7 +172,7 @@ gpu.module @test_kernel { // ----- #l = #xegpu.layout gpu.module @test_kernel { - gpu.func 
@test_elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index %c1024 = arith.constant 1024 : index @@ -211,7 +211,7 @@ gpu.module @test_kernel { // ----- #l = #xegpu.layout gpu.module @test_kernel { - gpu.func @test_elementwise_1D(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { + gpu.func @elementwise_1D(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) { %c0 = arith.constant 0 : index %c32 = arith.constant 32 : index %c1024 = arith.constant 1024 : index diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir index b911bb3bbdc1c..41414d802f212 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir @@ -2,7 +2,7 @@ gpu.module @test { - // CHECK-LABEL: test_create_nd_tdesc + // CHECK-LABEL: create_nd_tdesc // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast @@ -10,31 +10,31 @@ gpu.module @test { // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>, // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32> // CHECK-SAME: to !xegpu.tensor_desc<24x32xf32, #xegpu.layout> {__xegpu_blocking_tile_shape__ = array, __xegpu_blocking_unpack__} - gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> { + gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> 
!xegpu.tensor_desc<24x32xf32, #xegpu.layout> gpu.return %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> } //----- - // CHECK-LABEL: test_create_nd_tdesc_1d + // CHECK-LABEL: create_nd_tdesc_1d // CHECK-SAME: [[arg0:%.+]]: memref<64xf32> // CHECK-COUNT-2: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> // CHECK-SAME: to !xegpu.tensor_desc<32xf32, #xegpu.layout> {__xegpu_blocking_tile_shape__ = array, __xegpu_blocking_unpack__} - gpu.func @test_create_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout> { + gpu.func @create_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout> { %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout> gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.layout> } //----- - // CHECK-LABEL: test_update_nd_tdesc + // CHECK-LABEL: update_nd_tdesc // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-COUNT-6: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf32> - gpu.func @test_update_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> { + gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %update = xegpu.update_nd_offset %tdesc, [0, 16] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> gpu.return %update : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> @@ -42,11 +42,11 @@ gpu.module @test { //----- - // CHECK-LABEL: test_update_nd_tdesc_1d + // CHECK-LABEL: update_nd_tdesc_1d // CHECK-SAME: [[arg0:%.+]]: memref<64xf32> // 
CHECK-COUNT-2: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> // CHECK-COUNT-2: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16xf32> - gpu.func @test_update_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout> { + gpu.func @update_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout> { %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout> %update = xegpu.update_nd_offset %tdesc, [32] : !xegpu.tensor_desc<32xf32, #xegpu.layout> gpu.return %update : !xegpu.tensor_desc<32xf32, #xegpu.layout> @@ -54,11 +54,11 @@ gpu.module @test { //----- - // CHECK-LABEL: test_prefetch_nd_tdesc + // CHECK-LABEL: prefetch_nd_tdesc // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-COUNT-6: xegpu.prefetch_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> - gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) { + gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> gpu.return @@ -66,23 +66,23 @@ gpu.module @test { //----- - // CHECK-LABEL: test_prefetch_nd_tdesc_1d + // CHECK-LABEL: prefetch_nd_tdesc_1d // CHECK-SAME: [[arg0:%.+]]: memref<64xf32> // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> // CHECK-COUNT-4: xegpu.prefetch_nd {{.*}} : !xegpu.tensor_desc<16xf32> - gpu.func @test_prefetch_nd_tdesc_1d(%src: memref<64xf32>) { + gpu.func @prefetch_nd_tdesc_1d(%src: memref<64xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout> xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<64xf32, 
#xegpu.layout> gpu.return } //----- - // CHECK-LABEL: test_load_nd + // CHECK-LABEL: load_nd // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-COUNT-6: [[ld:%.+]] = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> // CHECK-COUNT-6: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<24x32xf32> - gpu.func @test_load_nd(%src: memref<24x32xf32>) -> vector<24x32xf32> { + gpu.func @load_nd(%src: memref<24x32xf32>) -> vector<24x32xf32> { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %ld = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> gpu.return %ld : vector<24x32xf32> @@ -90,12 +90,12 @@ gpu.module @test { //----- - // CHECK-LABEL: test_load_nd_1d + // CHECK-LABEL: load_nd_1d // CHECK-SAME: [[arg0:%.+]]: memref<64xf32> // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> // CHECK-COUNT-4: [[ld:%.+]] = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16xf32> -> vector<16xf32> // CHECK-COUNT-4: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<16xf32> into vector<64xf32> - gpu.func @test_load_nd_1d(%src: memref<64xf32>) -> vector<64xf32> { + gpu.func @load_nd_1d(%src: memref<64xf32>) -> vector<64xf32> { %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout> %data = xegpu.load_nd %tdesc: !xegpu.tensor_desc<64xf32, #xegpu.layout> -> vector<64xf32> gpu.return %data : vector<64xf32> @@ -103,11 +103,11 @@ gpu.module @test { //----- - // CHECK-LABEL: test_store_nd + // CHECK-LABEL: store_nd // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-COUNT-6: 
xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - gpu.func @test_store_nd(%src: memref<24x32xf32>) { + gpu.func @store_nd(%src: memref<24x32xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %data = arith.constant dense<9.0> : vector<24x32xf32> xegpu.store_nd %data, %tdesc: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout> @@ -116,11 +116,11 @@ gpu.module @test { //----- - // CHECK-LABEL: test_store_nd_1d + // CHECK-LABEL: store_nd_1d // CHECK-SAME: [[arg0:%.+]]: memref<64xf32> // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32> // CHECK-COUNT-4: xegpu.store_nd {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32> - gpu.func @test_store_nd_1d(%src: memref<64xf32>) { + gpu.func @store_nd_1d(%src: memref<64xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout> %data = arith.constant dense<9.0> : vector<64xf32> xegpu.store_nd %data, %tdesc: vector<64xf32>, !xegpu.tensor_desc<64xf32, #xegpu.layout> @@ -129,7 +129,7 @@ gpu.module @test { //----- - // CHECK-LABEL: test_createNd_loadNd_storeNd + // CHECK-LABEL: createNd_loadNd_storeNd // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> //CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-COUNT-6: [[data:%.+]] = xegpu.load_nd {{.*}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> @@ -137,7 +137,7 @@ gpu.module @test { //CHECK: [[add:%.+]] = arith.addf {{.*}} : vector<24x32xf32> //CHECK-COUNT-6: [[extract:%.+]] = vector.extract_strided_slice {{.*}} : vector<24x32xf32> to vector<8x16xf32> //CHECK-COUNT-6: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> - gpu.func @test_createNd_loadNd_storeNd(%src: memref<24x32xf32>) { + gpu.func @createNd_loadNd_storeNd(%src: memref<24x32xf32>) { %tdesc = 
xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> %data = arith.constant dense<9.0> : vector<24x32xf32> %ld = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> @@ -148,14 +148,237 @@ gpu.module @test { //----- - // CHECK-LABEL: test_dpas + // CHECK-LABEL: dpas // CHECK-SAME: [[arg0:%.+]]: vector<32x32xf16>, [[arg1:%.+]]: vector<32x32xf16> //CHECK-COUNT-8: [[extract1:%.+]] = vector.extract_strided_slice [[arg0]] {{.*}} : vector<32x32xf16> to vector<8x16xf16> //CHECK-COUNT-4: [[extract2:%.+]] = vector.extract_strided_slice [[arg1]] {{.*}} : vector<32x32xf16> to vector<16x16xf16> //CHECK-COUNT-16: [[dpas:%.+]] = xegpu.dpas {{.*}} -> vector<8x16xf32> //CHECK-COUNT-8: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32> - gpu.func @test_dpas(%a: vector<32x32xf16>, %b: vector<32x32xf16>) -> vector<32x32xf32> { + gpu.func @dpas(%a: vector<32x32xf16>, %b: vector<32x32xf16>) -> vector<32x32xf32> { %c = xegpu.dpas %a, %b : vector<32x32xf16>, vector<32x32xf16> -> vector<32x32xf32> gpu.return %c : vector<32x32xf32> } + +//----- + + // CHECK-LABEL: create_tdesc_vec + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + gpu.func @create_tdesc_vec(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> + gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> + } + +//----- + + // CHECK-LABEL: create_tdesc_step + // 
CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + gpu.func @create_tdesc_step(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> { + %step = arith.constant dense<8> : vector<32xindex> + %seq = vector.step : vector<32xindex> + %cst = arith.muli %seq, %step : vector<32xindex> + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> + gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> + } + +//----- + + // CHECK-LABEL: load + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK-COUNT-2: xegpu.load {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> + gpu.func @load(%src: ui64) -> vector<32xf32> { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> + %ld = xegpu.load %tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> -> vector<32xf32> + + gpu.return %ld : vector<32xf32> + } + +//----- + + // CHECK-LABEL: prefetch + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + gpu.func 
@prefetch(%src: ui64) { + + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> + + xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> + gpu.return + } + +//----- + + // CHECK-LABEL: store + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK-COUNT-2: xegpu.store {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> + gpu.func @store(%src: ui64) { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + + %st_vec = arith.constant dense<1023.0>: vector<32xf32> + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout> + xegpu.store %st_vec, %tdesc, %mask: vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout>, vector<32xi1> + + gpu.return + } + +//----- + // CHECK-LABEL: create_tdesc_step_chunk + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #xegpu.scatter_tdesc_attr> + gpu.func @create_tdesc_step_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> { + %step = arith.constant dense<8> : vector<32xindex> + %seq = vector.step : vector<32xindex> + %cst = arith.muli %seq, %step : 
vector<32xindex> + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + } + +//----- + // CHECK-LABEL: create_tdesc_step_chunk2 + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + gpu.func @create_tdesc_step_chunk2(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> { + %step = arith.constant dense<8> : vector<32xindex> + %seq = vector.step : vector<32xindex> + %cst = arith.muli %seq, %step : vector<32xindex> + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + } + +// CHECK-LABEL: create_tdesc_step_chunk3 + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex> + // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex> + // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex> + // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + gpu.func @create_tdesc_step_chunk3(%src: ui64) -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> { + %step = arith.constant dense<8> : vector<16xindex> + %seq = vector.step : vector<16xindex> + %cst 
= arith.muli %seq, %step : vector<16xindex> + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + gpu.return %tdesc : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + } + +//----- + // CHECK-LABEL: load_chunk + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK-COUNT-4: xegpu.load {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<2x16xf32> + + gpu.func @load_chunk(%src: ui64) -> vector<4x32xf32> { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + %ld = xegpu.load %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}>: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> -> vector<4x32xf32> + + gpu.return %ld : vector<4x32xf32> + } + +//----- + // CHECK-LABEL: store_chunk + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK-COUNT-4: xegpu.store {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : vector<2x16xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> + gpu.func @store_chunk(%src: ui64) { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 
112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + + %st_vec = arith.constant dense<1023.>: vector<4x32xf32> + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}>: vector<4x32xf32>, !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xi1> + + gpu.return + } + +//----- + // CHECK-LABEL: prefetch_chunk + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + gpu.func @prefetch_chunk(%src: ui64) { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + + gpu.return + } + +//----- + // CHECK-LABEL: update_chunk + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> + gpu.func @update_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 
104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + %delta = arith.constant dense<32>: vector<32xindex> + %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + + %new_tdesc = xegpu.update_offset %tdesc, %delta + : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout>, vector<32xindex> + + gpu.return %new_tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr, #xegpu.layout> + } } + diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir new file mode 100644 index 0000000000000..64f01d61d6e80 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir @@ -0,0 +1,164 @@ +// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s + +gpu.module @test_elementwise_ops { + // CHECK-LABEL: unary_ops + gpu.func @unary_ops(%a: memref<24x32xf32>) { + %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a + : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + // CHECK: math.exp {{.*}} {layout_result_0 = #xegpu.layout} : vector<12x8xf32> + %exp = math.exp %load_a + {layout_result_0 = #xegpu.layout} + : vector<24x32xf32> + // CHECK: arith.negf {{.*}} {layout_result_0 = #xegpu.layout} : vector<12x8xf32> + %negf = arith.negf %load_a + {layout_result_0 = #xegpu.layout} + : vector<24x32xf32> + gpu.return + } + + // CHECK-LABEL: binary_ops + gpu.func @binary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>) { + %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> + -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a + : 
!xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + %load_b = xegpu.load_nd %tdesc_b + : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + // CHECK: arith.addf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : vector<12x8xf32> + %addf = arith.addf %load_a, %load_b + {layout_result_0 = #xegpu.layout} + : vector<24x32xf32> + // CHECK: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : vector<12x8xf32> + %powf = math.powf %load_a, %load_b + {layout_result_0 = #xegpu.layout} + : vector<24x32xf32> + gpu.return + } + + // CHECK-LABEL: ternary_ops + gpu.func @ternary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi1>) { + %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> + -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi1> + -> !xegpu.tensor_desc<24x32xi1, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a + : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + %load_b = xegpu.load_nd %tdesc_b + : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + %load_c = xegpu.load_nd %tdesc_c + : !xegpu.tensor_desc<24x32xi1, #xegpu.layout> + -> vector<24x32xi1> + // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : vector<12x8xi1>, vector<12x8xf32> + %select = arith.select %load_c, %load_a, %load_b + {layout_result_0 = #xegpu.layout} + : vector<24x32xi1>, vector<24x32xf32> + // CHECK: math.fma {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : vector<12x8xf32> + %fma = math.fma %load_a, %load_b, %load_a + {layout_result_0 = #xegpu.layout} + : vector<24x32xf32> + gpu.return + } + + // CHECK-LABEL: type_conversion_ops + gpu.func @type_conversion_ops(%a: memref<24x32xf32>, %b: memref<24x32xi32>) { + 
%tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xi32> + -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a + : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + %load_b = xegpu.load_nd %tdesc_b + : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> + -> vector<24x32xi32> + // CHECK: arith.truncf {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : vector<12x8xf32> to vector<12x8xf16> + %truncf = arith.truncf %load_a + {layout_result_0 = #xegpu.layout} + : vector<24x32xf32> to vector<24x32xf16> + // CHECK: arith.bitcast {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : vector<12x8xi32> to vector<12x8xf32> + %bitcast = arith.bitcast %load_b + {layout_result_0 = #xegpu.layout} + : vector<24x32xi32> to vector<24x32xf32> + gpu.return + } + + // CHECK-LABEL: comparison_ops + gpu.func @comparison_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi32>, %d: memref<24x32xi32>) { + %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> + -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi32> + -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> + %tdesc_d = xegpu.create_nd_tdesc %d[0, 0] : memref<24x32xi32> + -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a + : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + %load_b = xegpu.load_nd %tdesc_b + : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + %load_c = xegpu.load_nd %tdesc_c + : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> + -> vector<24x32xi32> + %load_d = xegpu.load_nd %tdesc_d + : !xegpu.tensor_desc<24x32xi32, #xegpu.layout> + -> vector<24x32xi32> + // CHECK: arith.cmpf ult, 
{{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : vector<12x8xf32> + %cmpf = arith.cmpf ult, %load_a, %load_b + {layout_result_0 = #xegpu.layout} + : vector<24x32xf32> + // CHECK: arith.cmpi eq, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME: : vector<12x8xi32> + %cmpi = arith.cmpi eq, %load_c, %load_d + {layout_result_0 = #xegpu.layout} + : vector<24x32xi32> + gpu.return + } + + // 1 to N decomposition of elementwise operations + // CHECK-LABEL: elementwise_ops_rr_assignment + gpu.func @elementwise_ops_rr_assignment(%a: memref<24x32xf32>, %b: memref<24x32xf32>) { + %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> + -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32> + -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a + : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + %load_b = xegpu.load_nd %tdesc_b + : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME-COUNT-12: : vector<2x2xf32> + // CHECK-NOT: arith.negf + %negf = arith.negf %load_a + {layout_result_0 = #xegpu.layout} + : vector<24x32xf32> + // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout} + // CHECK-SAME-COUNT-12: : vector<2x2xf32> + // CHECK-NOT: math.powf + %powf = math.powf %load_a, %load_b + {layout_result_0 = #xegpu.layout} + : vector<24x32xf32> + gpu.return + } +} diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index 35ad16d8cd9a9..c6124f90e0f48 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -1,9 +1,9 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s gpu.module @test_round_robin_assignment { - // CHECK-LABEL: test_create_nd_tdesc + // CHECK-LABEL: 
create_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> - gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) { + gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) { // CHECK-COUNT-12: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<24x32xf32> // CHECK-SAME: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout> // CHECK-NOT: xegpu.create_nd_tdesc @@ -12,9 +12,9 @@ gpu.module @test_round_robin_assignment { gpu.return } - // CHECK-LABEL: test_load_nd_tdesc + // CHECK-LABEL: load_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> - gpu.func @test_load_nd_tdesc(%src: memref<24x32xf32>) { + gpu.func @load_nd_tdesc(%src: memref<24x32xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> // CHECK-COUNT-12: xegpu.load_nd %{{.*}} @@ -27,9 +27,9 @@ gpu.module @test_round_robin_assignment { gpu.return } - // CHECK-LABEL: test_store_nd + // CHECK-LABEL: store_nd // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> - gpu.func @test_store_nd(%src: memref<24x32xf32>) { + gpu.func @store_nd(%src: memref<24x32xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> // CHECK-COUNT-12: xegpu.store_nd %{{.*}}, %{{.*}} @@ -43,9 +43,9 @@ gpu.module @test_round_robin_assignment { gpu.return } - // CHECK-LABEL: test_update_nd + // CHECK-LABEL: update_nd // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> - gpu.func @test_update_nd(%src: memref<24x32xf32>){ + gpu.func @update_nd(%src: memref<24x32xf32>){ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> // CHECK-COUNT-12: xegpu.update_nd_offset %{{.*}}, [0, 16] @@ -56,9 +56,9 @@ gpu.module @test_round_robin_assignment { gpu.return } - // CHECK-LABEL: test_dpas + // CHECK-LABEL: dpas // CHECK-SAME: (%[[ARG_0:.*]]: memref<8x8xf32>, %[[ARG_1:.*]]: memref<8x8xf32>, %[[ARG_2:.*]]: memref<8x8xf32>) - gpu.func @test_dpas(%a: memref<8x8xf32>, %b: 
memref<8x8xf32>, %c: memref<8x8xf32>) { + gpu.func @dpas(%a: memref<8x8xf32>, %b: memref<8x8xf32>, %c: memref<8x8xf32>) { // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<8x8xf32> // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout> // CHECK-NOT: xegpu.create_nd_tdesc @@ -90,9 +90,9 @@ gpu.module @test_round_robin_assignment { gpu.return } - // CHECK-LABEL: test_prefetch_nd_tdesc + // CHECK-LABEL: prefetch_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> - gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) { + gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) { // CHECK-COUNT-12: xegpu.prefetch_nd %{{.*}} // CHECK-SAME-COUNT-12 : !xegpu.tensor_desc<2x2xf32, #xegpu.layout> // CHECK-NOT: xegpu.prefetch_nd @@ -103,7 +103,7 @@ gpu.module @test_round_robin_assignment { gpu.return } - gpu.func @test_scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { + gpu.func @scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { %c1 = arith.constant 1 : index %c10 = arith.constant 10 : index %c0 = arith.constant 0 : index @@ -126,7 +126,7 @@ gpu.module @test_round_robin_assignment { gpu.return } - gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { + gpu.func @scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { %c1_i32 = arith.constant 1 : i32 %c10_i32 = arith.constant 10 : i32 %c0_i32 = arith.constant 0 : i32 @@ -150,7 +150,7 @@ gpu.module @test_round_robin_assignment { gpu.return } - gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { + gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { %c10 = arith.constant 10 : index %0 = gpu.subgroup_id : index %1 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout> @@ -173,7 +173,7 @@ gpu.module @test_round_robin_assignment { gpu.return } - gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { 
+ gpu.func @scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { %c10 = arith.constant 10 : index %id = gpu.subgroup_id : index diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index 466842c968448..44b11c304cc80 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -3,9 +3,9 @@ //CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)> //CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> gpu.module @test_1_1_assignment { - // CHECK-LABEL: test_create_nd_tdesc + // CHECK-LABEL: create_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> - gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) { + gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) { // CHECK: %[[SGID:.*]] = gpu.subgroup_id // CHECK: %[[C12:.*]] = arith.constant 12 : index // CHECK: %[[C4:.*]] = arith.constant 4 : index @@ -30,9 +30,9 @@ gpu.module @test_1_1_assignment { gpu.return } - // CHECK-LABEL: test_load_nd_tdesc + // CHECK-LABEL: load_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> - gpu.func @test_load_nd_tdesc(%src: memref<24x32xf32>) { + gpu.func @load_nd_tdesc(%src: memref<24x32xf32>) { // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32> // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout> // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] @@ -46,9 +46,9 @@ gpu.module @test_1_1_assignment { gpu.return } - // CHECK-LABEL: test_store_nd + // CHECK-LABEL: store_nd // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> - gpu.func @test_store_nd(%src: memref<24x32xf32>) { + gpu.func @store_nd(%src: memref<24x32xf32>) { // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32> // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout> // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]] @@ -66,9 +66,9 @@ gpu.module @test_1_1_assignment { gpu.return } -// CHECK-LABEL: 
test_update_nd +// CHECK-LABEL: update_nd // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> -gpu.func @test_update_nd(%src: memref<24x32xf32>){ +gpu.func @update_nd(%src: memref<24x32xf32>){ // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32> // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout> // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16] @@ -80,10 +80,10 @@ gpu.func @test_update_nd(%src: memref<24x32xf32>){ gpu.return } -// CHECK-LABEL: test_dpas +// CHECK-LABEL: dpas // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> // CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32> -gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { +gpu.func @dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32> // CHECk-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout> // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]] @@ -114,10 +114,10 @@ gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { } -// CHECK-LABEL: test_dpas_no_sg_data +// CHECK-LABEL: dpas_no_sg_data // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> // CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32> -gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { +gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32> // CHECk-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout> // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]] @@ -147,9 +147,9 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { gpu.return } - // CHECK-LABEL: test_prefetch_nd_tdesc + // CHECK-LABEL: prefetch_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32> - gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) { + gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) { // CHECK: 
%[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32> // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout> // CHECK: xegpu.prefetch_nd %[[TDESC]] @@ -161,8 +161,8 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { gpu.return } - // CHECK-LABEL: test_dpas_with_no_create_nd_desc - gpu.func @test_dpas_with_no_create_nd_desc(%a: vector<24x32xf32>, %b: vector<32x24xf32>) { + // CHECK-LABEL: dpas_with_no_create_nd_desc + gpu.func @dpas_with_no_create_nd_desc(%a: vector<24x32xf32>, %b: vector<32x24xf32>) { // CHECK-NOT: vector<12x12xf32> %dpas = xegpu.dpas %a, %b {layout = #xegpu.layout} @@ -170,7 +170,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { gpu.return } - gpu.func @test_scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) { + gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) { //CHECK: [[c0:%.+]] = arith.constant 0 : index //CHECK: [[c128:%.+]] = arith.constant 128 : index //CHECK: [[c1024:%.+]] = arith.constant 1024 : index @@ -213,7 +213,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { gpu.return } - gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { + gpu.func @scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { %c1_i32 = arith.constant 1 : i32 %c10_i32 = arith.constant 10 : i32 %c0_i32 = arith.constant 0 : i32 @@ -238,7 +238,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { gpu.return } - gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { + gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { %c10 = arith.constant 10 : index %id = gpu.subgroup_id : index @@ -267,7 +267,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { gpu.return } - gpu.func 
@test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { + gpu.func @scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) { %c10 = arith.constant 10 : index %id = gpu.subgroup_id : index diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir new file mode 100644 index 0000000000000..5f6e8e4c30892 --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir @@ -0,0 +1,463 @@ +// REQUIRES: arm-emulator + +// DEFINE: %{compile} = mlir-opt %s \ +// DEFINE: --convert-vector-to-scf --convert-scf-to-cf --convert-vector-to-llvm='enable-arm-sve enable-arm-i8mm' \ +// DEFINE: --expand-strided-metadata --convert-to-llvm --finalize-memref-to-llvm \ +// DEFINE: --lower-affine --convert-arith-to-llvm --reconcile-unrealized-casts \ +// DEFINE: -o %t + +// DEFINE: %{entry_point} = main + +// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve,+i8mm" \ +// DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils + +// RUN: rm -f %t && %{compile} && FileCheck %s --input-file=%t -check-prefix CHECK-IR && %{run} | FileCheck %s + +#packed_maps = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (n, k)>, + affine_map<(m, n, k) -> (m, n)> +] + +// +// Test the lowering of `vector.contract` using the `LowerContractionToSVEI8MMPattern` +// +// The operation that the `vector.contract` in this test performs is matrix +// multiplication with accumulate +// OUT = ACC + LHS * RHS +// of two 8-bit integer matrices LHS and RHS, and a 32-bit integer matrix ACC +// into a 32-bit integer matrix OUT. The LHS and RHS can be sign- or zero- extended, +// this test covers all the possible variants. 
+// +// Tested are calculations as well as that the relevant `ArmSVE` dialect +// operations (`arm_sve.smmla`, `arm_sve.ummla`, etc) are emitted. +// +// The pattern above handles (therefore this test prepares) input/output vectors with +// specific shapes: +// * LHS: vector +// * RHS: vector<[N]x8xi8> +// * ACC, OUT: vector +// Note that the RHS is transposed. +// This data layout makes it efficient to load data into SVE +// registers in the layout expected by FEAT_I8MM instructions. +// Such a `vector.contract` is representative of the code we aim to generate +// by scalable vectorisation of `linalg.mmt4d`. +// See mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp +// for more information and rationale about these shapes. +// +// In this specific test we use M == 4 and N == 4 +// + +// Allocate and initialise a memref containing test data for use as the ACC +// operand. The memref has one dynamic dimension whose extent depends on the +// runtime value of VSCALE. +// +// The input parameter `%in` is a vector that is replicated VSCALE times +// across the columns of the memref. +func.func private @prepareAccTestData(%in: vector<4x4xi32>) -> memref<4x?xi32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0_i32 = arith.constant 0 : i32 + + %vs = vector.vscale + %d = arith.muli %c4, %vs : index + %mem = memref.alloc(%d) : memref<4x?xi32> + + scf.for %j = %c0 to %d step %c4 { + vector.transfer_write %in, %mem[%c0, %j] {in_bounds = [true, true]} : vector<4x4xi32>, memref<4x?xi32> + } + + return %mem : memref<4x?xi32> +} + +// Allocate and initialise a memref containing test data for use as the LHS +// operand. This function just writes the parameter `%in` into the memref. +// The size of the LHS does not depend on VSCALE. 
+func.func private @prepareLHSTestData(%in: vector<4x8xi8>) -> memref<4x8xi8> { + %c0 = arith.constant 0 : index + %c0_i8 = arith.constant 0 : i8 + + %mem = memref.alloc() : memref<4x8xi8> + vector.transfer_write %in, %mem[%c0, %c0] {in_bounds = [true, true]} : vector<4x8xi8>, memref<4x8xi8> + + return %mem : memref<4x8xi8> +} + +// Allocate and initialise a memref containing test data for use as the RHS +// operand. The memref has one dynamic dimension whose extent depends on the +// runtime value of VSCALE. +// +// The input parameter `%in` is a vector that is replicated VSCALE times +// across the rows of the memref. +// +// For convenience, flatten the memref, since the RHS vector is read first as a +// single-dimensional scalable vector and then cast into [N]x8 shape. +func.func private @prepareRHSTestData(%in: vector<4x8xi8>) -> memref { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0_i8 = arith.constant 0 : i8 + + %vs = vector.vscale + %d = arith.muli %c4, %vs : index + %mem = memref.alloc(%d) : memref + + scf.for %i = %c0 to %d step %c4 { + vector.transfer_write %in, %mem[%i, %c0] {in_bounds = [true, true]} : vector<4x8xi8>, memref + } + + %mem_out = memref.collapse_shape %mem [[0, 1]] : memref into memref + return %mem_out : memref +} + +// Test the operation where both LHS and RHS are interpreted as signed, hence +// we ultimately emit and execute the `smmla` instruction. 
+ +// CHECK-IR-LABEL: llvm.func @test_smmla +// CHECK-IR-COUNT-4: arm_sve.intr.smmla +func.func @test_smmla() { + + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c0_i8 = arith.constant 0 : i8 + + // Accumulator test data + %acc_cst = arith.constant dense<[[-44, 20, 44, -46], + [ -8, 25, -34, 26], + [-20, -36, -3, 39], + [-48, -31, -25, -21]]> : vector<4x4xi32> + + %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32> + %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32> + + // FIXME: Workaround for a crash, see https://github.com/llvm/llvm-project/issues/143670 + %acc_cast = memref.cast %acc_mem : memref<4x?xi32> to memref<*xi32> + call @printMemrefI32(%acc_cast) : (memref<*xi32>) -> () + + // LHS test data + %lhs_cst = arith.constant dense<[[-35, -27, -36, -31, 23, -34, -8, -33], + [-20, 17, -32, -47, 37, 22, -7, -21], + [ -7, -35, 20, -4, 39, 46, -23, 40], + [ 40, 27, 37, 43, 38, -6, 37, 49]]> : vector<4x8xi8> + + %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8> + %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8> + + // RHS test data + %rhs_cst = arith.constant dense<[[-17, -50, -1, 48, -13, 22, 39, 33], + [-35, -24, 37, -32, 33, 30, -11, -17], + [-28, 31, 3, -44, -15, -27, 22, 35], + [-23, 39, 48, 26, -23, 32, -39, -38]]> : vector<4x8xi8> + + %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref + %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} : memref, vector<[32]xi8> + %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8> + + // Matrix multiplication and accumulate with transposed RHS. 
+ %0 = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32> + %1 = arith.extsi %rhs : vector<[4]x8xi8> to vector<[4]x8xi32> + %2 = vector.contract {indexing_maps = #packed_maps, + iterator_types = ["parallel", "parallel", "reduction"], + kind = #vector.kind} %0, %1, %acc + : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32> + + // Display the result of the multiplication + vector.print str "Result(SMMLA):\n" + %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32> + %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32> + %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32> + %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32> + vector.print %u0 : vector<[4]xi32> + vector.print %u1 : vector<[4]xi32> + vector.print %u2 : vector<[4]xi32> + vector.print %u3 : vector<[4]xi32> + + // Deallocate the buffers. + memref.dealloc %acc_mem : memref<4x?xi32> + memref.dealloc %lhs_mem : memref<4x8xi8> + memref.dealloc %rhs_mem : memref + + return +} + +// Test the operation where both LHS and RHS are interpreted as unsigned, hence +// we ultimately emit and execute the `ummla` instruction. 
+ +// CHECK-IR-LABEL: llvm.func @test_ummla +// CHECK-IR-COUNT-4: arm_sve.intr.ummla +func.func @test_ummla() { + + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c0_i8 = arith.constant 0 : i8 + + // Accumulator test data + %acc_cst = arith.constant dense<[[16, 16, 48, 40], + [40, 24, 35, 12], + [33, 24, 29, 19], + [28, 13, 33, 18]]> : vector<4x4xi32> + + %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32> + %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32> + + // LHS test data + %lhs_cst = arith.constant dense<[[35, 42, 37, 49, 36, 36, 23, 33], + [39, 34, 33, 45, 43, 10, 44, 47], + [18, 35, 29, 25, 36, 33, 28, 29], + [26, 49, 43, 32, 27, 16, 45, 33]]> : vector<4x8xi8> + + %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8> + %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8> + + // RHS test data + %rhs_cst = arith.constant dense<[[18, 31, 37, 35, 44, 22, 37, 28], + [21, 22, 49, 39, 30, 28, 35, 37], + [21, 47, 39, 35, 23, 43, 24, 49], + [49, 49, 40, 32, 37, 20, 47, 40]]> : vector<4x8xi8> + + %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref + %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} : memref, vector<[32]xi8> + %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8> + + // Matrix multiplication and accumulate with transposed RHS. 
+ %0 = arith.extui %lhs : vector<4x8xi8> to vector<4x8xi32> + %1 = arith.extui %rhs : vector<[4]x8xi8> to vector<[4]x8xi32> + %2 = vector.contract {indexing_maps = #packed_maps, + iterator_types = ["parallel", "parallel", "reduction"], + kind = #vector.kind} %0, %1, %acc + : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32> + + // Display the result of the multiplication + vector.print str "Result(UMMLA):\n" + %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32> + %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32> + %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32> + %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32> + vector.print %u0 : vector<[4]xi32> + vector.print %u1 : vector<[4]xi32> + vector.print %u2 : vector<[4]xi32> + vector.print %u3 : vector<[4]xi32> + + // Deallocate the buffers. + memref.dealloc %acc_mem : memref<4x?xi32> + memref.dealloc %lhs_mem : memref<4x8xi8> + memref.dealloc %rhs_mem : memref + + return +} + +// Test the operation where LHS is interpreted as unsigned and RHS is +// interpreted as signed, hence we ultimately emit and execute the `usmmla` +// instruction. 
+ +// CHECK-IR-LABEL: llvm.func @test_usmmla +// CHECK-IR-COUNT-4: arm_sve.intr.usmmla +func.func @test_usmmla() { + + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c0_i8 = arith.constant 0 : i8 + + // Accumulator test data + %acc_cst = arith.constant dense<[[-44, 20, 44, -46], + [ -8, 25, -34, 26], + [-20, -36, -3, 39], + [-48, -31, -25, -21]]> : vector<4x4xi32> + + %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32> + %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32> + + // LHS test data + %lhs_cst = arith.constant dense<[[153, 161, 24, 157, 211, 154, 52, 27], + [168, 77, 136, 124, 249, 28, 13, 122], + [ 97, 82, 181, 39, 53, 25, 80, 240], + [184, 227, 106, 165, 126, 113, 121, 228]]> : vector<4x8xi8> + + %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8> + %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8> + + // RHS test data + %rhs_cst = arith.constant dense<[[ 40, 27, 37, 43, 38, -6, 37, 49], + [-17, -50, -1, 48, -13, 22, 39, 33], + [-35, -24, 37, -32, 33, 30, -11, -17], + [-28, 31, 3, -44, -15, -27, 22, 35]]> : vector<4x8xi8> + + %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref + %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} : memref, vector<[32]xi8> + %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8> + + // Matrix multiplication and accumulate with transposed RHS. 
+ %0 = arith.extui %lhs : vector<4x8xi8> to vector<4x8xi32> + %1 = arith.extsi %rhs : vector<[4]x8xi8> to vector<[4]x8xi32> + %2 = vector.contract {indexing_maps = #packed_maps, + iterator_types = ["parallel", "parallel", "reduction"], + kind = #vector.kind} %0, %1, %acc + : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32> + + // Display the result of the multiplication + vector.print str "Result(USMMLA):\n" + %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32> + %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32> + %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32> + %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32> + vector.print %u0 : vector<[4]xi32> + vector.print %u1 : vector<[4]xi32> + vector.print %u2 : vector<[4]xi32> + vector.print %u3 : vector<[4]xi32> + + // Deallocate the buffers. + memref.dealloc %acc_mem : memref<4x?xi32> + memref.dealloc %lhs_mem : memref<4x8xi8> + memref.dealloc %rhs_mem : memref + + return +} + +// Test the operation where LHS is interpreted as signed and RHS is interpreted +// as unsigned. In this test we ultimately emit and execute the `usmmla` +// instruction with reversed operands, see `LowerContractionToSVEI8MMPattern.cpp` +// for more details. 
+ +// CHECK-IR-LABEL: llvm.func @test_summla +// CHECK-IR-COUNT-4: arm_sve.intr.usmmla +func.func @test_summla() { + + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c0_i8 = arith.constant 0 : i8 + + // Accumulator test data + %acc_cst = arith.constant dense<[[-44, 20, 44, -46], + [ -8, 25, -34, 26], + [-20, -36, -3, 39], + [-48, -31, -25, -21]]> : vector<4x4xi32> + + %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32> + %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32> + + // LHS test data + %lhs_cst = arith.constant dense<[[-35, -27, -36, -31, 23, -34, -8, -33], + [-20, 17, -32, -47, 37, 22, -7, -21], + [ -7, -35, 20, -4, 39, 46, -23, 40], + [ 40, 27, 37, 43, 38, -6, 37, 49]]> : vector<4x8xi8> + + %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8> + %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8> + + // RHS test data + %rhs_cst = arith.constant dense<[[125, 171, 138, 187, 108, 175, 82, 99], + [221, 25, 164, 97, 156, 221, 218, 177], + [171, 160, 219, 191, 144, 45, 161, 210], + [223, 165, 123, 99, 108, 86, 37, 92]]> : vector<4x8xi8> + + %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref + %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} : memref, vector<[32]xi8> + %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8> + + // Matrix multiplication and accumulate with transposed RHS. 
+ %0 = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32> + %1 = arith.extui %rhs : vector<[4]x8xi8> to vector<[4]x8xi32> + %2 = vector.contract {indexing_maps = #packed_maps, + iterator_types = ["parallel", "parallel", "reduction"], + kind = #vector.kind} %0, %1, %acc + : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32> + + // Display the result of the multiplication + vector.print str "Result(SUMMLA (i.e. USMMLA transposed)):\n" + %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32> + %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32> + %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32> + %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32> + vector.print %u0 : vector<[4]xi32> + vector.print %u1 : vector<[4]xi32> + vector.print %u2 : vector<[4]xi32> + vector.print %u3 : vector<[4]xi32> + + // Deallocate the buffers. + memref.dealloc %acc_mem : memref<4x?xi32> + memref.dealloc %lhs_mem : memref<4x8xi8> + memref.dealloc %rhs_mem : memref + + return +} + +// Perform each test with SVE vector lengths 128 bits and 256 bits (i.e. VSCALEs +// 1 and 2, respectively). The vector length is set via the `setArmVLBits` +// function. The effect of setting a different vector length is that the tests +// allocate and operate on different sized buffers (see `prepareTestData` +// functions). 
+ +func.func @main() { + %c128 = arith.constant 128 : i32 + %c256 = arith.constant 256 : i32 + +// CHECK-LABEL: Result(SMMLA): +// CHECK: ( -1999, 1941, 685, -2879 ) +// CHECK: ( -3705, 2952, 987, -685 ) +// CHECK: ( 2565, 4157, -1589, -357 ) +// CHECK: ( 2383, -2252, 32, -1365 ) + func.call @setArmVLBits(%c128) : (i32) -> () + func.call @test_smmla() : () -> () + +// CHECK: Result(SMMLA): +// CHECK: ( -1999, 1941, 685, -2879, -1999, 1941, 685, -2879 ) +// CHECK: ( -3705, 2952, 987, -685, -3705, 2952, 987, -685 ) +// CHECK: ( 2565, 4157, -1589, -357, 2565, 4157, -1589, -357 ) +// CHECK: ( 2383, -2252, 32, -1365, 2383, -2252, 32, -1365 ) + func.call @setArmVLBits(%c256) : (i32) -> () + func.call @test_smmla() : () -> () + +// CHECK-LABEL: Result(UMMLA): +// CHECK: ( 9183, 9513, 10460, 11314 ) +// CHECK: ( 9648, 9812, 10092, 12088 ) +// CHECK: ( 7548, 7625, 8398, 9044 ) +// CHECK: ( 8855, 9046, 9685, 11191 ) + func.call @setArmVLBits(%c128) : (i32) -> () + func.call @test_ummla() : () -> () + +// CHECK: Result(UMMLA): +// CHECK: ( 9183, 9513, 10460, 11314, 9183, 9513, 10460, 11314 ) +// CHECK: ( 9648, 9812, 10092, 12088, 9648, 9812, 10092, 12088 ) +// CHECK: ( 7548, 7625, 8398, 9044, 7548, 7625, 8398, 9044 ) +// CHECK: ( 8855, 9046, 9685, 11191, 8855, 9046, 9685, 11191 ) + func.call @setArmVLBits(%c256) : (i32) -> () + func.call @test_ummla() : () -> () + +// CHECK-LABEL: Result(USMMLA): +// CHECK: ( 28403, 445, -2759, -11409 ) +// CHECK: ( 34908, 1047, 142, -7274 ) +// CHECK: ( 31032, 6807, -2378, 7382 ) +// CHECK: ( 44217, 6396, -10930, 623 ) + func.call @setArmVLBits(%c128) : (i32) -> () + func.call @test_usmmla() : () -> () + +// CHECK: Result(USMMLA): +// CHECK: ( 28403, 445, -2759, -11409, 28403, 445, -2759, -11409 ) +// CHECK: ( 34908, 1047, 142, -7274, 34908, 1047, 142, -7274 ) +// CHECK: ( 31032, 6807, -2378, 7382, 31032, 6807, -2378, 7382 ) +// CHECK: ( 44217, 6396, -10930, 623, 44217, 6396, -10930, 623 ) + func.call @setArmVLBits(%c256) : (i32) -> () + 
func.call @test_usmmla() : () -> () + +// CHECK-LABEL: Result(SUMMLA (i.e. USMMLA transposed)): +// CHECK: ( -27190, -28812, -30502, -23575 ) +// CHECK: ( -7613, -8386, -15938, -6521 ) +// CHECK: ( 9468, 18750, 9199, 5764 ) +// CHECK: ( 33655, 41064, 48900, 31627 ) + func.call @setArmVLBits(%c128) : (i32) -> () + func.call @test_summla() : () -> () + +// CHECK: Result(SUMMLA (i.e. USMMLA transposed)): +// CHECK: ( -27190, -28812, -30502, -23575, -27190, -28812, -30502, -23575 ) +// CHECK: ( -7613, -8386, -15938, -6521, -7613, -8386, -15938, -6521 ) +// CHECK: ( 9468, 18750, 9199, 5764, 9468, 18750, 9199, 5764 ) +// CHECK: ( 33655, 41064, 48900, 31627, 33655, 41064, 48900, 31627 ) + func.call @setArmVLBits(%c256) : (i32) -> () + func.call @test_summla() : () -> () + + return +} + +func.func private @setArmVLBits(%bits : i32) +func.func private @printMemrefI32(%ptr : memref<*xi32>) diff --git a/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll b/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll index cc3b47a54dfe9..c623df0b605b2 100644 --- a/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll +++ b/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll @@ -36,14 +36,17 @@ bbd: ; // ----- +; Verify that a single weight attached to a call is not translated. +; The MLIR WeightedBranchOpInterface does not support this case. 
+ ; CHECK: llvm.func @fn() -declare void @fn() +declare i32 @fn() ; CHECK-LABEL: @call_branch_weights -define void @call_branch_weights() { - ; CHECK: llvm.call @fn() {branch_weights = array} - call void @fn(), !prof !0 - ret void +define i32 @call_branch_weights() { + ; CHECK: llvm.call @fn() : () -> i32 + %1 = call i32 @fn(), !prof !0 + ret i32 %1 } !0 = !{!"branch_weights", i32 42} diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir index 24a7b42557278..a8ef401fff27e 100644 --- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir @@ -448,3 +448,19 @@ llvm.mlir.global external constant @const() {addr_space = 0 : i32, dso_local} : } llvm.func extern_weak @extern_func() + +// ----- + +llvm.func @invoke_branch_weights_callee() +llvm.func @__gxx_personality_v0(...) -> i32 + +llvm.func @invoke_branch_weights() -> i32 attributes {personality = @__gxx_personality_v0} { + %0 = llvm.mlir.constant(1 : i32) : i32 + // expected-error @below{{expects number of branch weights to match number of successors: 1 vs 2}} + llvm.invoke @invoke_branch_weights_callee() to ^bb2 unwind ^bb1 {branch_weights = array} : () -> () +^bb1: // pred: ^bb0 + %1 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)> + llvm.br ^bb2 +^bb2: // 2 preds: ^bb0, ^bb1 + llvm.return %0 : i32 +} diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 7742259e7a478..fc1993b50ba2d 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -1906,32 +1906,6 @@ llvm.func @cond_br_weights(%cond : i1, %arg0 : i32, %arg1 : i32) -> i32 { // ----- -llvm.func @fn() - -// CHECK-LABEL: @call_branch_weights -llvm.func @call_branch_weights() { - // CHECK: !prof ![[NODE:[0-9]+]] - llvm.call @fn() {branch_weights = array} : () -> () - llvm.return -} - -// CHECK: ![[NODE]] = !{!"branch_weights", i32 42} - -// ----- - -llvm.func @fn() -> i32 - -// CHECK-LABEL: 
@call_branch_weights -llvm.func @call_branch_weights() { - // CHECK: !prof ![[NODE:[0-9]+]] - %res = llvm.call @fn() {branch_weights = array} : () -> i32 - llvm.return -} - -// CHECK: ![[NODE]] = !{!"branch_weights", i32 42} - -// ----- - llvm.func @foo() llvm.func @__gxx_personality_v0(...) -> i32 diff --git a/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir b/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir new file mode 100644 index 0000000000000..a755cef98d7c4 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir @@ -0,0 +1,66 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} { + omp.private {type = private} @_QFEj_private_i32 : i32 loc(#loc1) + omp.private {type = private} @_QFEi_private_i32 : i32 loc(#loc1) + llvm.func @test() { + %3 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %3 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr<5> loc(#loc4) + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr loc(#loc4) + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5> loc(#loc4) + %8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr + %9 = llvm.mlir.constant(16383 : index) : i64 + %10 = llvm.mlir.constant(0 : index) : i64 + %11 = llvm.mlir.constant(1 : index) : i64 + %12 = llvm.mlir.constant(16384 : i32) : i32 + %14 = llvm.mlir.addressof @_QFEarray : !llvm.ptr + %18 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} loc(#loc3) + %20 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "j"} loc(#loc3) + %22 = omp.map.bounds lower_bound(%10 : i64) upper_bound(%9 : i64) extent(%9 : i64) stride(%11 : i64) start_idx(%11 : i64) 
loc(#loc3) + %23 = omp.map.info var_ptr(%14 : !llvm.ptr, !llvm.array<16384 x i32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%22) -> !llvm.ptr {name = "array"} loc(#loc3) + %24 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} loc(#loc3) + omp.target map_entries(%18 -> %arg0, %20 -> %arg2, %23 -> %arg4, %24 -> %arg5 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %25 = llvm.mlir.constant(1 : i32) : i32 + %27 = llvm.mlir.constant(16384 : i32) : i32 + omp.teams { + omp.distribute private(@_QFEi_private_i32 %arg5 -> %arg6 : !llvm.ptr) { + omp.loop_nest (%arg7) : i32 = (%25) to (%27) inclusive step (%25) { + omp.parallel { + omp.wsloop private(@_QFEj_private_i32 %arg2 -> %arg8 : !llvm.ptr) { + omp.loop_nest (%arg9) : i32 = (%25) to (%27) inclusive step (%25) { + llvm.store %arg9, %arg8 : i32, !llvm.ptr loc(#loc9) + omp.yield + } loc(#loc9) + } loc(#loc9) + omp.terminator loc(#loc9) + } loc(#loc9) + omp.yield loc(#loc9) + } loc(#loc9) + } loc(#loc9) + omp.terminator loc(#loc9) + } loc(#loc9) + omp.terminator loc(#loc9) + } loc(#loc9) + llvm.return loc(#loc9) + } loc(#loc14) + llvm.mlir.global internal @_QFEarray() {addr_space = 0 : i32} : !llvm.array<16384 x i32> { + %0 = llvm.mlir.zero : !llvm.array<16384 x i32> + llvm.return %0 : !llvm.array<16384 x i32> + } loc(#loc2) +} +#di_file = #llvm.di_file<"test.f90" in ""> +#di_null_type = #llvm.di_null_type +#loc1 = loc("test.f90":4:23) +#loc2 = loc("test.f90":4:15) +#loc3 = loc("test.f90":1:7) +#loc4 = loc("test.f90":4:18) +#loc9 = loc("test.f90":13:11) +#di_compile_unit = #llvm.di_compile_unit, sourceLanguage = DW_LANG_Fortran95, file = #di_file, producer = "flang", isOptimized = true, emissionKind = LineTablesOnly> +#di_subroutine_type = #llvm.di_subroutine_type +#di_subprogram = #llvm.di_subprogram, compileUnit = #di_compile_unit, scope = #di_file, name = "main", file = #di_file, subprogramFlags = 
"Definition|Optimized|MainSubprogram", type = #di_subroutine_type> +#loc14 = loc(fused<#di_subprogram>[#loc3]) + + +// CHECK: call void @__kmpc_distribute_static{{.*}}!dbg + diff --git a/mlir/test/Target/LLVMIR/omptarget-depend.mlir b/mlir/test/Target/LLVMIR/omptarget-depend.mlir index f2948c6510138..0f2437639319a 100644 --- a/mlir/test/Target/LLVMIR/omptarget-depend.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-depend.mlir @@ -126,7 +126,8 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a // CHECK-DAG: %[[DEP_ARRAY:.+]] = alloca [1 x %struct.kmp_dep_info], align 8 // CHECK: %[[TASKDATA:.+]] = call ptr @__kmpc_omp_task_alloc({{.+}}, ptr @.omp_target_task_proxy_func) -// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[TASKDATA]], align 8 +// CHECK: %[[SHARED_PTR:.+]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASKDATA]], i32 0, i32 0 +// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[SHARED_PTR]], align 8 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHARED_DATA]], ptr align 1 %[[STRUCTARG]], i64 24, i1 false) // CHECK: %[[DEP_INFO:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARRAY]], i64 0, i64 0 diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir index b487b31d54477..5eee7b7d7d976 100644 --- a/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir @@ -13,19 +13,48 @@ module attributes {omp.target_triples = ["dummy-target-triple"]} { } llvm.return } +} +// CHECK: %struct.[[TSK_WTH_PRVTS:.*]] = type { %struct.kmp_task_ompbuilder_t, %struct.[[PRVTS:.*]] } +// CHECK: %struct.kmp_task_ompbuilder_t = type { ptr, ptr, i32, ptr, ptr } +// CHECK: %struct.[[PRVTS]] = type { [1 x ptr], [1 x ptr] } // CHECK: define void @_QPfoo() { +// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8 +// CHECK: %[[BASEPTRS:.*]] = alloca [1 x ptr], align 8 +// CHECK: %[[PTRS:.*]] = 
alloca [1 x ptr], align 8 +// CHECK: %[[MAPPERS:.*]] = alloca [1 x ptr], align 8 + +// CHECK: getelementptr inbounds [1 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0 +// CHECK: getelementptr inbounds [1 x ptr], ptr %[[PTRS]], i32 0, i32 0 +// CHECK: %[[BASEPTRS_GEP:.*]] = getelementptr inbounds [1 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0 +// CHECK: %[[PTRS_GEP:.*]] = getelementptr inbounds [1 x ptr], ptr %[[PTRS]], i32 0, i32 0 -// CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc -// CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr -// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}}) -// CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]]) +// CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc +// CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr +// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}}) +// CHECK: %[[TSK_PTR:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK]], i32 0, i32 0 +// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TSK_PTR]], i32 0, i32 0 +// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHAREDS_PTR]], ptr align 1 %[[STRUCTARG]], i64 8, i1 false) +// CHECK: %[[VAL_50:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK]], i32 0, i32 1 +// CHECK: %[[VAL_51:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 0 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_51]], ptr align 1 %[[BASEPTRS_GEP]], i64 8, i1 false) +// CHECK: %[[VAL_53:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 1 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_53]], ptr align 1 %[[PTRS_GEP]], i64 8, i1 false) +// CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]]) // CHECK: } +// CHECK: define internal void 
@[[WORKER:.*]](i32 {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}) { -// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) { -// CHECK: call void @_QPfoo..omp_par(i32 %{{.*}}, ptr %{{.*}}) -// CHECK: } -} +// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %[[THREAD_ID_PARAM:.*]], ptr %[[TASK_DESC_PARAM:.*]]) { +// CHECK: %[[PRIVATE_DATA:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 1 +// CHECK: %[[BASEPTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 0 +// CHECK: %[[PTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 1 +// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8 +// CHECK: %[[TASK:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 0 +// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASK]], i32 0, i32 0 +// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[STRUCTARG]], ptr align 1 %[[SHAREDS_PTR]], i64 8, i1 false) +// CHECK: call void @[[WORKER]](i32 %{{.*}}, ptr %{{.*}}) diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait.mlir new file mode 100644 index 0000000000000..19333c44322f1 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-nowait.mlir @@ -0,0 +1,70 @@ +// RUN: mlir-translate -mlir-to-llvmir %s 2>&1 | FileCheck %s + +module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @launch_(%arg0: !llvm.ptr {fir.bindc_name = "a", llvm.nocapture}) { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x f64 {bindc_name = "n"} : (i64) -> !llvm.ptr + %2 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, f64) map_clauses(tofrom) 
capture(ByRef) var_ptr_ptr(%2 : !llvm.ptr) -> !llvm.ptr {name = ""} + %4 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(to) capture(ByRef) members(%3 : [0] : !llvm.ptr) -> !llvm.ptr {name = "a"} + %5 = omp.map.info var_ptr(%1 : !llvm.ptr, f64) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "n"} + omp.target nowait map_entries(%4 -> %arg1, %5 -> %arg2, %3 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %two_f = llvm.mlir.constant(2.000000e+00 : f64) : f64 + %one_i = llvm.mlir.constant(1 : index) : i64 + %6 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> + %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr + %8 = llvm.getelementptr %7[%one_i] : (!llvm.ptr, i64) -> !llvm.ptr, i8 + %9 = llvm.load %8 : !llvm.ptr -> f64 + %10 = llvm.fmul %9, %two_f {fastmathFlags = #llvm.fastmath} : f64 + llvm.store %10, %8 : f64, !llvm.ptr + omp.terminator + } + llvm.return + } +} + +// CHECK: %struct.[[TSK_WTH_PRVTS:.*]] = type { %struct.kmp_task_ompbuilder_t, %struct.[[PRVTS:.*]] } +// CHECK: %struct.kmp_task_ompbuilder_t = type { ptr, ptr, i32, ptr, ptr } +// CHECK: %struct.[[PRVTS]] = type { [5 x ptr], [5 x ptr], [5 x i64] } + +// CHECK: define void @launch_(ptr captures(none) %0) +// CHECK: %[[STRUCTARG:.*]] = alloca { ptr, ptr }, align 8 +// CHECK: %[[BASEPTRS:.*]] = alloca [5 x ptr], align 8 +// CHECK: %[[PTRS:.*]] = alloca [5 x ptr], align 8 +// CHECK: %[[MAPPERS:.*]] = alloca [5 x ptr], align 8 +// CHECK: %[[SIZES:.*]] = alloca [5 x i64], align 4 + + +// CHECK: %[[VAL_20:.*]] = getelementptr inbounds [5 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0 +// CHECK: %[[BASEPTRS_GEP:.*]] = getelementptr inbounds [5 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0 +// CHECK: %[[PTRS_GEP:.*]] = getelementptr inbounds [5 x ptr], ptr %[[PTRS]], i32 0, i32 0 +// CHECK: %[[SIZES_GEP:.*]] = getelementptr 
inbounds [5 x i64], ptr %[[SIZES]], i32 0, i32 0 + +// CHECK: %[[GL_THRD_NUM:.*]] = call i32 @__kmpc_global_thread_num +// CHECK: %[[TASK_DESC:.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @4, i32 {{.*}}, i32 0, i64 160, i64 16, ptr [[TGT_TSK_PRXY_FNC:.*]], i64 -1) +// CHECK: %[[TSK_PTR:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC]], i32 0, i32 0 +// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TSK_PTR]], i32 0, i32 0 +// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHAREDS_PTR]], ptr align 1 %[[STRUCTARG]], i64 16, i1 false) +// CHECK: %[[VAL_50:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC]], i32 0, i32 1 +// CHECK: %[[VAL_51:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 0 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_51]], ptr align 1 %[[BASEPTRS_GEP]], i64 40, i1 false) +// CHECK: %[[VAL_53:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 1 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_53]], ptr align 1 %[[PTRS_GEP]], i64 40, i1 false) +// CHECK: %[[VAL_54:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 2 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_54]], ptr align 1 %[[SIZES_GEP]], i64 40, i1 false) +// CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_omp_task(ptr @4, i32 %[[GL_THRD_NUM]], ptr %[[TASK_DESC]]) + +// CHECK: define internal void @[[WORKER:.*]](i32 {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}) { + +// CHECK: define internal void [[TGT_TSK_PRXY_FNC]](i32 %[[THREAD_ID_PARAM:.*]], ptr %[[TASK_DESC_PARAM:.*]]) { +// CHECK: %[[PRIVATE_DATA:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 1 +// CHECK: %[[BASEPTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], 
ptr %[[PRIVATE_DATA]], i32 0, i32 0 +// CHECK: %[[PTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 1 +// CHECK: %[[SIZES:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 2 +// CHECK: %[[STRUCTARG:.*]] = alloca { ptr, ptr }, align 8 +// CHECK: %[[TASK:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 0 +// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASK]], i32 0, i32 0 +// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[STRUCTARG]], ptr align 1 %[[SHAREDS_PTR]], i64 16, i1 false) +// CHECK: call void @[[WORKER]](i32 %[[THREAD_ID_PARAM]], ptr %[[BASEPTRS]], ptr %[[PTRS]], ptr %[[SIZES]], ptr %[[STRUCTARG]]) diff --git a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir index 8124d02ef2174..dba8c553aaca5 100644 --- a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir @@ -14,25 +14,20 @@ llvm.func @_QPopenmp_target_data_enter() { // CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc // CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr -// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}}) +// CHECK-SAME: @[[TASK_PROXY_FUNC_ENTER:.*]], i64 {{.*}}) // CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]]) // CHECK: } -// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) { -// CHECK: %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0 -// CHECK: %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8 -// CHECK: %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1 -// CHECK: %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr 
%[[OFFLOAD_PTRS]], align 8 - +// CHECK: define internal void @[[TASK_BODY_FUNC_ENTER:.*]](i32 {{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) { // CHECK: call void @__tgt_target_data_begin_nowait_mapper( // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1, -// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]], +// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]], // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null) // CHECK: } -// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) { -// CHECK: call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) +// CHECK: define internal void @[[TASK_PROXY_FUNC_ENTER]](i32 %{{.*}}, ptr %{{.*}}) { +// CHECK: call void @[[TASK_BODY_FUNC_ENTER]](i32 %{{.*}}, ptr %{{.*}}, ptr %{{.*}}) // CHECK: } // ----- @@ -51,25 +46,20 @@ llvm.func @_QPopenmp_target_data_update() { // CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc // CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr -// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}}) +// CHECK-SAME: @[[TASK_PROXY_FUNC_UPDATE:.*]], i64 {{.*}}) // CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]]) // CHECK: } -// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) { -// CHECK: %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0 -// CHECK: %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8 -// CHECK: %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1 -// CHECK: %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8 - +// CHECK: define internal void @[[TASK_BODY_FUNC_UPDATE:.*]](i32 {{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) { // CHECK: call void @__tgt_target_data_update_nowait_mapper( // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1, -// CHECK-SAME: ptr 
%[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]], +// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]], // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null) // CHECK: } -// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) { -// CHECK: call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) +// CHECK: define internal void @[[TASK_PROXY_FUNC_UPDATE]](i32 %{{.*}}, ptr %{{.*}}) { +// CHECK: call void @[[TASK_BODY_FUNC_UPDATE]](i32 %{{.*}}, ptr %{{.*}}) // CHECK: } // ----- @@ -88,23 +78,18 @@ llvm.func @_QPopenmp_target_data_exit() { // CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc // CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr -// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}}) +// CHECK-SAME: @[[TASK_PROXY_FUNC_EXIT:.*]], i64 {{.*}}) // CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]]) // CHECK: } -// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) { -// CHECK: %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0 -// CHECK: %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8 -// CHECK: %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1 -// CHECK: %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8 - +// CHECK: define internal void @[[TASK_BODY_FUNC_EXIT:.*]](i32 %{{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) { // CHECK: call void @__tgt_target_data_end_nowait_mapper( // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1, -// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]], +// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]], // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null) // CHECK: } -// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) { -// 
CHECK: call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) +// CHECK: define internal void @[[TASK_PROXY_FUNC_EXIT]](i32 %{{.*}}, ptr %{{.*}}) { +// CHECK: call void @[[TASK_BODY_FUNC_EXIT]](i32 %{{.*}}, ptr %{{.*}}) // CHECK: } diff --git a/mlir/test/Target/LLVMIR/openmp-task-charbox.mlir b/mlir/test/Target/LLVMIR/openmp-task-charbox.mlir new file mode 100644 index 0000000000000..7a448f74ed648 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-task-charbox.mlir @@ -0,0 +1,87 @@ +// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s + +// Regression test for a compiler crash. Ensure that the insertion point is set +// correctly when triggering the charbox hack multiple times. +// Nonsense test code to minimally reproduce the issue. + +module { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + omp.private {type = private} @_QFEc2_private_box_heap_c8xU : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(24 : i32) : i32 + %1 = llvm.mlir.constant(0 : i64) : i64 + %2 = llvm.mlir.constant(1 : i32) : i32 + %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + "llvm.intr.memcpy"(%3, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %6 = llvm.ptrtoint %arg0 : !llvm.ptr to i64 + %7 = llvm.icmp "eq" %6, %1 : i64 + llvm.cond_br %7, ^bb1, ^bb2 + ^bb1: // pred: ^bb0 + llvm.br ^bb3 + ^bb2: // pred: ^bb0 + llvm.br ^bb3 + ^bb3: // 2 preds: ^bb1, ^bb2 + omp.yield(%arg1 : !llvm.ptr) + } dealloc { + ^bb0(%arg0: !llvm.ptr): + omp.yield + } + omp.private {type = private} @_QFEc1_private_box_ptr_c8xU : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(24 : i32) : i32 + %1 = llvm.mlir.constant(1 : i32) : i32 + %2 = llvm.alloca %1 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + 
"llvm.intr.memcpy"(%2, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + omp.yield(%arg1 : !llvm.ptr) + } + llvm.func @_QQmain() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "c2"} : (i64) -> !llvm.ptr + %2 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "c1"} : (i64) -> !llvm.ptr + omp.task private(@_QFEc1_private_box_ptr_c8xU %2 -> %arg0, @_QFEc2_private_box_heap_c8xU %1 -> %arg1 : !llvm.ptr, !llvm.ptr) { + omp.terminator + } + llvm.return + } +} + +// CHECK-LABEL: @_QQmain() { +// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8 +// CHECK: %[[VAL_0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 +// CHECK: br label %[[VAL_2:.*]] +// CHECK: entry: ; preds = %[[VAL_3:.*]] +// CHECK: br label %[[VAL_4:.*]] +// CHECK: omp.private.init: ; preds = %[[VAL_2]] +// CHECK: %[[VAL_5:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ { ptr, i64, i32, i8, i8, i8, i8 }, { ptr, i64, i32, i8, i8, i8, i8 } }, ptr null, i32 1) to i64)) +// CHECK: %[[VAL_6:.*]] = getelementptr { { ptr, i64, i32, i8, i8, i8, i8 }, { ptr, i64, i32, i8, i8, i8, i8 } }, ptr %[[VAL_5]], i32 0, i32 0 +// CHECK: %[[VAL_7:.*]] = getelementptr { { ptr, i64, i32, i8, i8, i8, i8 }, { ptr, i64, i32, i8, i8, i8, i8 } }, ptr %[[VAL_5]], i32 0, i32 1 +// ... 
+// CHECK: br label %[[VAL_9:.*]] +// CHECK: omp.private.init4: ; preds = %[[VAL_10:.*]], %[[VAL_11:.*]] +// CHECK: br label %[[VAL_12:.*]] +// CHECK: omp.private.init3: ; preds = %[[VAL_9]] +// CHECK: br label %[[VAL_13:.*]] +// CHECK: omp.private.init2: ; preds = %[[VAL_9]] +// CHECK: br label %[[VAL_13]] +// CHECK: omp.private.init1: ; preds = %[[VAL_4]] +// CHECK: %[[VAL_14:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[VAL_14]], ptr %[[VAL_0]], i32 24, i1 false) +// CHECK: %[[VAL_15:.*]] = ptrtoint ptr %[[VAL_0]] to i64 +// CHECK: %[[VAL_16:.*]] = icmp eq i64 %[[VAL_15]], 0 +// CHECK: br i1 %[[VAL_16]], label %[[VAL_10]], label %[[VAL_11]] +// CHECK: omp.region.cont: ; preds = %[[VAL_13]] +// CHECK: %[[VAL_17:.*]] = phi ptr [ %[[VAL_7]], %[[VAL_13]] ] +// CHECK: br label %[[VAL_18:.*]] +// CHECK: omp.private.copy: ; preds = %[[VAL_12]] +// CHECK: br label %[[VAL_19:.*]] +// CHECK: omp.task.start: ; preds = %[[VAL_18]] +// CHECK: br label %[[VAL_20:.*]] +// CHECK: codeRepl: ; preds = %[[VAL_19]] +// CHECK: %[[VAL_21:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0 +// CHECK: store ptr %[[VAL_5]], ptr %[[VAL_21]], align 8 +// CHECK: %[[VAL_22:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: %[[VAL_23:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[VAL_22]], i32 1, i64 40, i64 8, ptr @_QQmain..omp_par) +// CHECK: %[[VAL_24:.*]] = load ptr, ptr %[[VAL_23]], align 8 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_24]], ptr align 1 %[[STRUCTARG]], i64 8, i1 false) +// CHECK: %[[VAL_25:.*]] = call i32 @__kmpc_omp_task(ptr @1, i32 %[[VAL_22]], ptr %[[VAL_23]]) diff --git a/mlir/test/Target/SPIRV/constant.mlir b/mlir/test/Target/SPIRV/constant.mlir index 8d4e53418b70f..50d9b09ee0042 100644 --- a/mlir/test/Target/SPIRV/constant.mlir +++ b/mlir/test/Target/SPIRV/constant.mlir @@ -1,6 +1,7 @@ // RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | 
FileCheck %s +// RUN: %if spirv-tools %{ mlir-translate -no-implicit-module -serialize-spirv %s | spirv-val %} -spirv.module Logical GLSL450 requires #spirv.vce { +spirv.module Logical Vulkan requires #spirv.vce { // CHECK-LABEL: @bool_const spirv.func @bool_const() -> () "None" { // CHECK: spirv.Constant true @@ -305,4 +306,6 @@ spirv.module Logical GLSL450 requires #spirv.vce { %coop = spirv.Constant dense<4> : !spirv.coopmatrix<16x16xi8, Subgroup, MatrixAcc> spirv.ReturnValue %coop : !spirv.coopmatrix<16x16xi8, Subgroup, MatrixAcc> } + + spirv.EntryPoint "GLCompute" @bool_const } diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir index eacf36bfba9ce..832f7ea2fe314 100644 --- a/mlir/test/Target/SPIRV/gl-ops.mlir +++ b/mlir/test/Target/SPIRV/gl-ops.mlir @@ -128,6 +128,10 @@ spirv.module Logical GLSL450 requires #spirv.vce { %8 = spirv.GL.FindSMsb %arg3 : vector<3xi32> // CHECK: {{%.*}} = spirv.GL.FindUMsb {{%.*}} : vector<3xi32> %9 = spirv.GL.FindUMsb %arg3 : vector<3xi32> + // CHECK: {{%.*}} = spirv.GL.Length {{%.*}} : f32 -> f32 + %10 = spirv.GL.Length %arg0 : f32 -> f32 + // CHECK: {{%.*}} = spirv.GL.Length {{%.*}} : vector<3xf32> -> f32 + %11 = spirv.GL.Length %arg1 : vector<3xf32> -> f32 spirv.Return } diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 3f3461e92bc08..4400d6d9625f7 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -19,6 +19,10 @@ using namespace mlir::xegpu; namespace { +#define DEBUG_TYPE "test-xegpu-unroll" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + struct TestXeGPUUnrollingPatterns : public PassWrapper> { @@ -48,7 +52,9 @@ struct TestXeGPUUnrollingPatterns options.setNativeShapeFn( [&](Operation *op) -> std::optional> { if (isa(op)) { + xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::StoreNdOp, + 
xegpu::CreateDescOp, xegpu::UpdateOffsetOp, xegpu::PrefetchOp, + xegpu::LoadGatherOp, xegpu::StoreScatterOp>(op)) { xegpu::TensorDescType tdescTy; if (auto createNdOp = dyn_cast(op)) { tdescTy = createNdOp.getType(); @@ -61,6 +67,16 @@ struct TestXeGPUUnrollingPatterns tdescTy = loadNdOp.getTensorDescType(); } else if (auto storeNdOp = dyn_cast(op)) { tdescTy = storeNdOp.getTensorDescType(); + } else if (auto createOp = dyn_cast(op)) { + tdescTy = createOp.getType(); + } else if (auto updateOp = dyn_cast(op)) { + tdescTy = updateOp.getTensorDescType(); + } else if (auto prefetchOp = dyn_cast(op)) { + tdescTy = prefetchOp.getTensorDescType(); + } else if (auto loadOp = dyn_cast(op)) { + tdescTy = loadOp.getTensorDescType(); + } else if (auto storeOp = dyn_cast(op)) { + tdescTy = storeOp.getTensorDescType(); } if (auto layout = tdescTy.getLayoutAttr()) { @@ -88,14 +104,40 @@ struct TestXeGPUUnrollingPatterns Attribute encoding = tdescTy.getEncoding(); auto layout = llvm::dyn_cast_if_present( tdescTy.getLayout()); + + // If the encoding is a ScatterTensorDescAttr, we need to + // potentially adjust the chunk size based on the inst_data. 
+ if (encoding && mlir::isa(encoding)) { + auto scatterAttr = + mlir::dyn_cast(encoding); + int64_t chunkSize = scatterAttr.getChunkSize().getInt(); + + if (chunkSize > 1) { + int64_t blockedChunkSize = chunkSize; + auto instData = layout.getInstData(); + if (!instData.empty()) + blockedChunkSize = instData.asArrayRef().back(); + + auto chunkSizeAttr = mlir::IntegerAttr::get( + mlir::IntegerType::get(ctx, 64), blockedChunkSize); + + // To create a new attribute with a different chunk_size: + auto newEncoding = xegpu::ScatterTensorDescAttr::get( + ctx, scatterAttr.getMemorySpace(), chunkSizeAttr); + + encoding = newEncoding; + } + } if (layout) { if (layout.getLaneLayout() == nullptr) layout = xegpu::LayoutAttr(); else layout = layout.dropInstData(); } + newTy = xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding, layout); + } else { newTy = type.clone(tileShape, elemTy); } diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 9b5cadd62befc..a6f1ac0d568f4 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -332,6 +332,7 @@ def find_real_python_interpreter(): else: config.available_features.add("noasserts") +config.targets = frozenset(config.targets_to_build.split()) def have_host_jit_feature_support(feature_name): mlir_runner_exe = lit.util.which("mlir-runner", config.mlir_tools_dir) diff --git a/mlir/test/lit.local.cfg b/mlir/test/lit.local.cfg new file mode 100644 index 0000000000000..167c454db5184 --- /dev/null +++ b/mlir/test/lit.local.cfg @@ -0,0 +1,7 @@ +if not "SPIRV" in config.root.targets: + config.unsupported = True + +if config.spirv_tools_tests: + config.available_features.add("spirv-tools") + config.substitutions.append(("spirv-as", os.path.join(config.llvm_tools_dir, "spirv-as"))) + config.substitutions.append(("spirv-val", os.path.join(config.llvm_tools_dir, "spirv-val"))) diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in index 132aabe135940..77f24e0f29b09 100644 --- a/mlir/test/lit.site.cfg.py.in 
+++ b/mlir/test/lit.site.cfg.py.in @@ -5,6 +5,8 @@ import sys config.target_triple = "@LLVM_TARGET_TRIPLE@" config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") +config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@ +config.targets_to_build = "@TARGETS_TO_BUILD@" config.llvm_shlib_ext = "@SHLIBEXT@" config.llvm_shlib_dir = lit_config.substitute(path(r"@SHLIBDIR@")) config.python_executable = "@Python3_EXECUTABLE@" @@ -41,7 +43,7 @@ config.mlir_run_amx_tests = @MLIR_RUN_AMX_TESTS@ config.mlir_run_arm_sve_tests = @MLIR_RUN_ARM_SVE_TESTS@ # This is a workaround for the fact that LIT's: # %if -# requires to be in the set of available features. +# requires to be in the set of available features. # TODO: Update LIT's TestRunner so that this is not required. if config.mlir_run_arm_sve_tests: config.available_features.add("mlir_arm_sve_tests") diff --git a/mlir/test/mlir-query/complex-test.mlir b/mlir/test/mlir-query/backward-slice-union.mlir similarity index 71% rename from mlir/test/mlir-query/complex-test.mlir rename to mlir/test/mlir-query/backward-slice-union.mlir index ad96f03747a43..f8f88c2043749 100644 --- a/mlir/test/mlir-query/complex-test.mlir +++ b/mlir/test/mlir-query/backward-slice-union.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-query %s -c "m getAllDefinitions(hasOpName(\"arith.addf\"),2)" | FileCheck %s +// RUN: mlir-query %s -c "m anyOf(getAllDefinitions(hasOpName(\"arith.addf\"),2),getAllDefinitions(hasOpName(\"tensor.extract\"),1))" | FileCheck %s #map = affine_map<(d0, d1) -> (d0, d1)> func.func @slice_use_from_above(%arg0: tensor<5x5xf32>, %arg1: tensor<5x5xf32>) { @@ -19,14 +19,23 @@ func.func @slice_use_from_above(%arg0: tensor<5x5xf32>, %arg1: tensor<5x5xf32>) } // CHECK: Match #1: - // CHECK: %[[LINALG:.*]] = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} // CHECK-SAME: ins(%arg0 : tensor<5x5xf32>) outs(%arg1 : tensor<5x5xf32>) + +// CHECK: 
{{.*}}.mlir:7:10: note: "root" binds here // CHECK: %[[ADDF1:.*]] = arith.addf %in, %in : f32 // CHECK: Match #2: +// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[LINALG]] {{\[\[.*\]\]}} : tensor<5x5xf32> into tensor<25xf32> +// CHECK: %[[C2:.*]] = arith.constant {{.*}} : index +// CHECK: {{.*}}.mlir:14:18: note: "root" binds here +// CHECK: %[[EXTRACTED:.*]] = tensor.extract %[[COLLAPSED]][%[[C2]]] : tensor<25xf32> + +// CHECK: Match #3: // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[LINALG]] {{\[\[.*\]\]}} : tensor<5x5xf32> into tensor<25xf32> // CHECK: %[[C2:.*]] = arith.constant {{.*}} : index // CHECK: %[[EXTRACTED:.*]] = tensor.extract %[[COLLAPSED]][%[[C2]]] : tensor<25xf32> + +// CHECK: {{.*}}.mlir:15:10: note: "root" binds here // CHECK: %[[ADDF2:.*]] = arith.addf %[[EXTRACTED]], %[[EXTRACTED]] : f32 diff --git a/mlir/test/mlir-query/forward-slice-by-predicate.mlir b/mlir/test/mlir-query/forward-slice-by-predicate.mlir new file mode 100644 index 0000000000000..e11378da89d9f --- /dev/null +++ b/mlir/test/mlir-query/forward-slice-by-predicate.mlir @@ -0,0 +1,27 @@ +// RUN: mlir-query %s -c "m getUsersByPredicate(anyOf(hasOpName(\"memref.alloc\"),isConstantOp()),anyOf(hasOpName(\"affine.load\"), hasOpName(\"memref.dealloc\")),true)" | FileCheck %s + +func.func @slice_depth1_loop_nest_with_offsets() { + %0 = memref.alloc() : memref<100xf32> + %cst = arith.constant 7.000000e+00 : f32 + affine.for %i0 = 0 to 16 { + %a0 = affine.apply affine_map<(d0) -> (d0 + 2)>(%i0) + affine.store %cst, %0[%a0] : memref<100xf32> + } + affine.for %i1 = 4 to 8 { + %a1 = affine.apply affine_map<(d0) -> (d0 - 1)>(%i1) + %1 = affine.load %0[%a1] : memref<100xf32> + } + return +} + +// CHECK: Match #1: +// CHECK: {{.*}}.mlir:4:8: note: "root" binds here +// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<100xf32> + +// CHECK: affine.store %cst, %0[%a0] : memref<100xf32> + +// CHECK: Match #2: +// CHECK: {{.*}}.mlir:5:10: note: "root" binds here +// CHECK: %[[CST:.*]] = 
arith.constant 7.000000e+00 : f32 + +// CHECK: affine.store %[[CST]], %0[%a0] : memref<100xf32> diff --git a/mlir/test/mlir-query/logical-operator-test.mlir b/mlir/test/mlir-query/logical-operator-test.mlir new file mode 100644 index 0000000000000..ac05428287abd --- /dev/null +++ b/mlir/test/mlir-query/logical-operator-test.mlir @@ -0,0 +1,11 @@ +// RUN: mlir-query %s -c "m allOf(hasOpName(\"memref.alloca\"), hasOpAttrName(\"alignment\"))" | FileCheck %s + +func.func @dynamic_alloca(%arg0: index, %arg1: index) -> memref { + %0 = memref.alloca(%arg0, %arg1) : memref + memref.alloca(%arg0, %arg1) {alignment = 32} : memref + return %0 : memref +} + +// CHECK: Match #1: +// CHECK: {{.*}}.mlir:5:3: note: "root" binds here +// CHECK: memref.alloca(%arg0, %arg1) {alignment = 32} : memref diff --git a/mlir/test/mlir-query/slice-function-extraction.mlir b/mlir/test/mlir-query/slice-function-extraction.mlir new file mode 100644 index 0000000000000..e55d5e77c5736 --- /dev/null +++ b/mlir/test/mlir-query/slice-function-extraction.mlir @@ -0,0 +1,29 @@ +// RUN: mlir-query %s -c "m getDefinitionsByPredicate(hasOpName(\"memref.store\"),hasOpName(\"memref.alloc\"),true,false,false).extract(\"backward_slice\")" | FileCheck %s + +// CHECK: func.func @backward_slice(%{{.*}}: memref<10xf32>) -> (f32, index, index, f32, index, index, f32) { +// CHECK: %[[CST0:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index +// CHECK-NEXT: %[[I0:.*]] = affine.apply affine_map<()[s0] -> (s0)>()[%[[C0]]] +// CHECK-NEXT: memref.store %[[CST0]], %{{.*}}[%[[I0]]] : memref<10xf32> +// CHECK-NEXT: %[[CST2:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[I1:.*]] = affine.apply affine_map<() -> (0)>() +// CHECK-NEXT: memref.store %[[CST2]], %{{.*}}[%[[I1]]] : memref<10xf32> +// CHECK-NEXT: %[[C1:.*]] = arith.constant 0 : index +// CHECK-NEXT: %[[LOAD:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<10xf32> +// CHECK-NEXT: memref.store %[[LOAD]], 
%{{.*}}[%[[C1]]] : memref<10xf32> +// CHECK-NEXT: return %[[CST0]], %[[C0]], %[[I0]], %[[CST2]], %[[I1]], %[[C1]], %[[LOAD]] : f32, index, index, f32, index, index, f32 + +func.func @slicing_memref_store_trivial() { + %0 = memref.alloc() : memref<10xf32> + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + affine.for %i1 = 0 to 10 { + %1 = affine.apply affine_map<()[s0] -> (s0)>()[%c0] + memref.store %cst, %0[%1] : memref<10xf32> + %2 = memref.load %0[%c0] : memref<10xf32> + %3 = affine.apply affine_map<()[] -> (0)>()[] + memref.store %cst, %0[%3] : memref<10xf32> + memref.store %2, %0[%c0] : memref<10xf32> + } + return +} diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py index eeb95605d7a9a..6c5e4e5505b1c 100644 --- a/mlir/test/python/dialects/transform.py +++ b/mlir/test/python/dialects/transform.py @@ -256,30 +256,45 @@ def testReplicateOp(module: Module): # CHECK: %{{.*}} = replicate num(%[[FIRST]]) %[[SECOND]] +# CHECK-LABEL: TEST: testApplyRegisteredPassOp @run def testApplyRegisteredPassOp(module: Module): + # CHECK: transform.sequence sequence = transform.SequenceOp( transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get() ) with InsertionPoint(sequence.body): + # CHECK: %{{.*}} = apply_registered_pass "canonicalize" to {{.*}} : (!transform.any_op) -> !transform.any_op mod = transform.ApplyRegisteredPassOp( transform.AnyOpType.get(), sequence.bodyTarget, "canonicalize" ) + # CHECK: %{{.*}} = apply_registered_pass "canonicalize" + # CHECK-SAME: with options = {"top-down" = false} + # CHECK-SAME: to {{.*}} : (!transform.any_op) -> !transform.any_op mod = transform.ApplyRegisteredPassOp( transform.AnyOpType.get(), mod.result, "canonicalize", options={"top-down": BoolAttr.get(False)}, ) + # CHECK: %[[MAX_ITER:.+]] = transform.param.constant max_iter = transform.param_constant( transform.AnyParamType.get(), IntegerAttr.get(IntegerType.get_signless(64), 10), ) + # CHECK: 
%[[MAX_REWRITE:.+]] = transform.param.constant max_rewrites = transform.param_constant( transform.AnyParamType.get(), IntegerAttr.get(IntegerType.get_signless(64), 1), ) - transform.apply_registered_pass( + # CHECK: %{{.*}} = apply_registered_pass "canonicalize" + # NB: MLIR has sorted the dict lexicographically by key: + # CHECK-SAME: with options = {"max-iterations" = %[[MAX_ITER]], + # CHECK-SAME: "max-rewrites" = %[[MAX_REWRITE]], + # CHECK-SAME: "test-convergence" = true, + # CHECK-SAME: "top-down" = false} + # CHECK-SAME: to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op + mod = transform.apply_registered_pass( transform.AnyOpType.get(), mod, "canonicalize", @@ -290,19 +305,30 @@ def testApplyRegisteredPassOp(module: Module): "max-rewrites": max_rewrites, }, ) + # CHECK: %{{.*}} = apply_registered_pass "symbol-privatize" + # CHECK-SAME: with options = {"exclude" = ["a", "b"]} + # CHECK-SAME: to %{{.*}} : (!transform.any_op) -> !transform.any_op + mod = transform.apply_registered_pass( + transform.AnyOpType.get(), + mod, + "symbol-privatize", + options={"exclude": ("a", "b")}, + ) + # CHECK: %[[SYMBOL_A:.+]] = transform.param.constant + symbol_a = transform.param_constant( + transform.AnyParamType.get(), StringAttr.get("a") + ) + # CHECK: %[[SYMBOL_B:.+]] = transform.param.constant + symbol_b = transform.param_constant( + transform.AnyParamType.get(), StringAttr.get("b") + ) + # CHECK: %{{.*}} = apply_registered_pass "symbol-privatize" + # CHECK-SAME: with options = {"exclude" = [%[[SYMBOL_A]], %[[SYMBOL_B]]]} + # CHECK-SAME: to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op + mod = transform.apply_registered_pass( + transform.AnyOpType.get(), + mod, + "symbol-privatize", + options={"exclude": (symbol_a, symbol_b)}, + ) transform.YieldOp() - # CHECK-LABEL: TEST: testApplyRegisteredPassOp - # CHECK: transform.sequence - # CHECK: %{{.*}} = apply_registered_pass 
"canonicalize" to {{.*}} : (!transform.any_op) -> !transform.any_op - # CHECK: %{{.*}} = apply_registered_pass "canonicalize" - # CHECK-SAME: with options = {"top-down" = false} - # CHECK-SAME: to {{.*}} : (!transform.any_op) -> !transform.any_op - # CHECK: %[[MAX_ITER:.+]] = transform.param.constant - # CHECK: %[[MAX_REWRITE:.+]] = transform.param.constant - # CHECK: %{{.*}} = apply_registered_pass "canonicalize" - # NB: MLIR has sorted the dict lexicographically by key: - # CHECK-SAME: with options = {"max-iterations" = %[[MAX_ITER]], - # CHECK-SAME: "max-rewrites" = %[[MAX_REWRITE]], - # CHECK-SAME: "test-convergence" = true, - # CHECK-SAME: "top-down" = false} - # CHECK-SAME: to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op diff --git a/mlir/tools/mlir-query/mlir-query.cpp b/mlir/tools/mlir-query/mlir-query.cpp index 78c0ec97c0cdf..8a17a33c61838 100644 --- a/mlir/tools/mlir-query/mlir-query.cpp +++ b/mlir/tools/mlir-query/mlir-query.cpp @@ -40,12 +40,22 @@ int main(int argc, char **argv) { query::matcher::Registry matcherRegistry; // Matchers registered in alphabetical order for consistency: + matcherRegistry.registerMatcher("allOf", query::matcher::internal::allOf); + matcherRegistry.registerMatcher("anyOf", query::matcher::internal::anyOf); + matcherRegistry.registerMatcher( + "getAllDefinitions", + query::matcher::m_GetAllDefinitions); matcherRegistry.registerMatcher( "getDefinitions", query::matcher::m_GetDefinitions); matcherRegistry.registerMatcher( - "getAllDefinitions", - query::matcher::m_GetAllDefinitions); + "getDefinitionsByPredicate", + query::matcher::m_GetDefinitionsByPredicate); + matcherRegistry.registerMatcher( + "getUsersByPredicate", + query::matcher::m_GetUsersByPredicate); matcherRegistry.registerMatcher("hasOpAttrName", static_cast(m_Attr)); matcherRegistry.registerMatcher("hasOpName", static_cast(m_Op)); diff --git a/mlir/utils/vscode/package-lock.json b/mlir/utils/vscode/package-lock.json 
index 1efd5779f5cb2..28454c680177b 100644 --- a/mlir/utils/vscode/package-lock.json +++ b/mlir/utils/vscode/package-lock.json @@ -10,6 +10,8 @@ "dependencies": { "base64-js": "^1.5.1", "chokidar": "3.5.2", + "minimatch": "^3.0.5", + "semver": "^7.5.2", "vscode-languageclient": "^8.0.2-next.5" }, "devDependencies": { @@ -89,6 +91,16 @@ "keytar": "^7.7.0" } }, + "node_modules/@vscode/vsce/node_modules/semver": { + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver" + } + }, "node_modules/@vscode/vsce/node_modules/xml2js": { "version": "0.5.0", "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz", @@ -1195,9 +1207,10 @@ } }, "node_modules/minimatch": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", - "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", + "version": "3.0.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.5.tgz", + "integrity": "sha512-tUpxzX0VAzJHjLu0xUfFv1gwVp9ba3IOuRAVH2EGuRW8a5emA2FlACLqiT/lDVtS1W+TGNwqz3sWaNyLgDJWuw==", + "license": "ISC", "dependencies": { "brace-expansion": "^1.1.7" }, @@ -1262,22 +1275,6 @@ "node": ">=10" } }, - "node_modules/node-abi/node_modules/semver": { - "version": "7.3.7", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz", - "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==", - "dev": true, - "optional": true, - "dependencies": { - "lru-cache": "^6.0.0" - }, - "bin": { - "semver": "bin/semver.js" - }, - "engines": { - "node": ">=10" - } - }, "node_modules/node-addon-api": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-4.3.0.tgz", @@ 
-1365,6 +1362,16 @@ "semver": "^5.1.0" } }, + "node_modules/parse-semver/node_modules/semver": { + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver" + } + }, "node_modules/parse5": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz", @@ -1567,12 +1574,18 @@ "dev": true }, "node_modules/semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==", - "dev": true, + "version": "7.5.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.2.tgz", + "integrity": "sha512-SoftuTROv/cRjCze/scjGyiDtcUyxw1rgYQSZY7XTmtR5hX+dm76iDbTH8TkLPHCQmlbQVSSbNZCPM2hb0knnQ==", + "license": "ISC", + "dependencies": { + "lru-cache": "^6.0.0" + }, "bin": { - "semver": "bin/semver" + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" } }, "node_modules/set-blocking": { @@ -1901,20 +1914,6 @@ "vscode": "^1.67.0" } }, - "node_modules/vscode-languageclient/node_modules/semver": { - "version": "7.3.7", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz", - "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==", - "dependencies": { - "lru-cache": "^6.0.0" - }, - "bin": { - "semver": "bin/semver.js" - }, - "engines": { - "node": ">=10" - } - }, "node_modules/vscode-languageserver-protocol": { "version": "3.17.2-next.6", "resolved": "https://registry.npmjs.org/vscode-languageserver-protocol/-/vscode-languageserver-protocol-3.17.2-next.6.tgz", @@ -2049,6 +2048,12 @@ "yazl": "^2.2.2" }, "dependencies": { + "semver": { + "version": "5.7.2", + "resolved": 
"https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", + "dev": true + }, "xml2js": { "version": "0.5.0", "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz", @@ -2895,9 +2900,9 @@ "optional": true }, "minimatch": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", - "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", + "version": "3.0.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.5.tgz", + "integrity": "sha512-tUpxzX0VAzJHjLu0xUfFv1gwVp9ba3IOuRAVH2EGuRW8a5emA2FlACLqiT/lDVtS1W+TGNwqz3sWaNyLgDJWuw==", "requires": { "brace-expansion": "^1.1.7" } @@ -2951,18 +2956,6 @@ "optional": true, "requires": { "semver": "^7.3.5" - }, - "dependencies": { - "semver": { - "version": "7.3.7", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz", - "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==", - "dev": true, - "optional": true, - "requires": { - "lru-cache": "^6.0.0" - } - } } }, "node-addon-api": { @@ -3035,6 +3028,14 @@ "dev": true, "requires": { "semver": "^5.1.0" + }, + "dependencies": { + "semver": { + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", + "dev": true + } } }, "parse5": { @@ -3200,10 +3201,12 @@ "dev": true }, "semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==", - "dev": true + "version": "7.5.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.2.tgz", + "integrity": 
"sha512-SoftuTROv/cRjCze/scjGyiDtcUyxw1rgYQSZY7XTmtR5hX+dm76iDbTH8TkLPHCQmlbQVSSbNZCPM2hb0knnQ==", + "requires": { + "lru-cache": "^6.0.0" + } }, "set-blocking": { "version": "2.0.0", @@ -3454,16 +3457,6 @@ "minimatch": "^3.0.4", "semver": "^7.3.5", "vscode-languageserver-protocol": "3.17.2-next.6" - }, - "dependencies": { - "semver": { - "version": "7.3.7", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz", - "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==", - "requires": { - "lru-cache": "^6.0.0" - } - } } }, "vscode-languageserver-protocol": { diff --git a/mlir/utils/vscode/package.json b/mlir/utils/vscode/package.json index 6d0f6f5c88adb..74f9ba37c7f16 100644 --- a/mlir/utils/vscode/package.json +++ b/mlir/utils/vscode/package.json @@ -39,6 +39,8 @@ "dependencies": { "base64-js": "^1.5.1", "chokidar": "3.5.2", + "minimatch": "^3.0.5", + "semver": "^7.5.2", "vscode-languageclient": "^8.0.2-next.5" }, "devDependencies": { diff --git a/offload/cmake/caches/AMDGPULibcBot.cmake b/offload/cmake/caches/AMDGPULibcBot.cmake new file mode 100644 index 0000000000000..728dfe3f0a3f1 --- /dev/null +++ b/offload/cmake/caches/AMDGPULibcBot.cmake @@ -0,0 +1,20 @@ +set(CMAKE_INSTALL_PREFIX /tmp/llvm.install.test CACHE STRING "") + +set(CMAKE_BUILD_TYPE Release CACHE STRING "") +set(BUILD_SHARED_LIBS ON CACHE BOOL "") +set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") +set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") + +set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") +set(LLVM_ENABLE_RUNTIMES "compiler-rt;libunwind;openmp;offload" CACHE STRING "") +set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") + +set(LLVM_TARGETS_TO_BUILD "host;AMDGPU;SPIRV" CACHE STRING "") +set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") +set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") + +set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") +set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "") + 
+set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "") +set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc" CACHE STRING "") diff --git a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel index 2c9f3e56e3113..450157758d75b 100644 --- a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel @@ -49,7 +49,6 @@ cc_library( "//llvm:CodeGen", "//llvm:Core", "//llvm:DebugInfoDWARF", - "//llvm:IRPrinter", "//llvm:Option", "//llvm:ProfileData", "//llvm:Support", @@ -153,7 +152,6 @@ cc_library( "//llvm:Option", "//llvm:Support", "//llvm:Symbolize", - "//llvm:Target", "//llvm:TargetParser", "//llvm:TransformUtils", "//llvm:WindowsDriver", @@ -210,15 +208,11 @@ cc_library( "//llvm:BitReader", "//llvm:BitWriter", "//llvm:CGData", - "//llvm:Core", "//llvm:DebugInfoDWARF", "//llvm:Demangle", "//llvm:LTO", - "//llvm:MC", - "//llvm:ObjCARC", "//llvm:Object", "//llvm:Option", - "//llvm:ProfileData", "//llvm:Support", "//llvm:TargetParser", "//llvm:TextAPI", diff --git a/utils/bazel/llvm-project-overlay/llvm/config.bzl b/utils/bazel/llvm-project-overlay/llvm/config.bzl index d9d3666a3ecce..7cb4b7e9ffe75 100644 --- a/utils/bazel/llvm-project-overlay/llvm/config.bzl +++ b/utils/bazel/llvm-project-overlay/llvm/config.bzl @@ -43,6 +43,7 @@ posix_defines = [ "HAVE_SETENV_R=1", "HAVE_STRERROR_R=1", "HAVE_SYSEXITS_H=1", + "HAVE_SYS_IOCTL_H=1", "HAVE_UNISTD_H=1", ] diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index 3ef1d0c4b1651..feac6a9d3308f 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -171,6 +171,9 @@ /* Define to 1 if you have the header file. */ #define HAVE_SYS_MMAN_H 1 +/* Define to 1 if you have the header file. 
*/ +/* HAVE_SYS_IOCTL_H defined in Bazel */ + /* Define to 1 if stat struct has st_mtimespec member .*/ /* #undef HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC */ diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h index 5dd53cffb7bd7..8a9c74d67b124 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h @@ -132,10 +132,10 @@ /* Define to 1 to enable expensive checks for debug location coverage checking, and to 0 otherwise. */ -#define LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 0 +#define LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 0 /* Define to 1 to enable expensive tracking of the origin of debug location coverage bugs, and to 0 otherwise. */ -#define LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 0 +#define LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN 0 #endif diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index e7398a696beaa..cb0f9d8c7413c 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3512,6 +3512,7 @@ cc_library( deps = [ ":AffineUtils", ":Analysis", + ":ArithDialect", ":ArithUtils", ":DialectUtils", ":FunctionInterfaces", @@ -3521,6 +3522,7 @@ cc_library( ":IndexDialect", ":InliningUtils", ":LoopLikeInterface", + ":MathDialect", ":MemRefDialect", ":Pass", ":SCFTransforms", diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 53405a0dea24a..a2fb5ade73247 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -37,6 +37,7 @@ expand_template( # All disabled, but required to substituted because they are not in quotes. 
"@LLVM_BUILD_EXAMPLES@": "0", "@LLVM_HAS_NVPTX_TARGET@": "0", + "@LLVM_INCLUDE_SPIRV_TOOLS_TESTS@": "0", "@MLIR_ENABLE_CUDA_RUNNER@": "0", "@MLIR_ENABLE_ROCM_CONVERSIONS@": "0", "@MLIR_ENABLE_ROCM_RUNNER@": "0", diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake index 06d4756397911..ce83de8e4cba9 100644 --- a/utils/bazel/llvm_configs/config.h.cmake +++ b/utils/bazel/llvm_configs/config.h.cmake @@ -164,6 +164,9 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SYS_MMAN_H ${HAVE_SYS_MMAN_H} +/* Define to 1 if you have the header file. */ +#cmakedefine HAVE_SYS_IOCTL_H ${HAVE_SYS_IOCTL_H} + /* Define to 1 if stat struct has st_mtimespec member .*/ #cmakedefine HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC} diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake index 6d3c37cc8b194..a0ad517a6ecf4 100644 --- a/utils/bazel/llvm_configs/llvm-config.h.cmake +++ b/utils/bazel/llvm_configs/llvm-config.h.cmake @@ -131,10 +131,10 @@ /* Define to 1 to enable expensive checks for debug location coverage checking, and to 0 otherwise. */ -#cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING +#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE /* Define to 1 to enable expensive tracking of the origin of debug location coverage bugs, and to 0 otherwise. 
*/ -#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING +#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN #endif From 0f4579789c5fd6633b2b4c8bd32cb6051f8a6ae8 Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Thu, 3 Jul 2025 00:28:40 -0500 Subject: [PATCH 58/58] Build fail due to RemoveBreakpoint change in llvm:main Changes to be committed: modified: lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp --- lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp index ace9e11927bee..326495625e74d 100644 --- a/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp +++ b/lldb/source/Plugins/Process/AIX/NativeProcessAIX.cpp @@ -1696,10 +1696,12 @@ void NativeProcessAIX::SignalIfAllThreadsStopped() { // Clear any temporary breakpoints we used to implement software single // stepping. for (const auto &thread_info : m_threads_stepping_with_breakpoint) { - Status error = RemoveBreakpoint(thread_info.second); - if (error.Fail()) - LLDB_LOG(log, "pid = {0} remove stepping breakpoint: {1}", - thread_info.first, error); + for (auto &&bp_addr : thread_info.second) { + Status error = RemoveBreakpoint(bp_addr); + if (error.Fail()) + LLDB_LOG(log, "pid = {0} remove stepping breakpoint: {1}", + thread_info.first, error); + } } m_threads_stepping_with_breakpoint.clear();